Plasma GitLab Archive
Projects Blog Knowledge

(* $Id: pxp_reader.ml 689 2004-08-07 17:01:52Z gerd $
 * ----------------------------------------------------------------------
 * PXP: The polymorphic XML parser for Objective Caml.
 * Copyright by Gerd Stolpmann. See LICENSE for details.
 *)

open Pxp_core_types;;
open Netchannels;;

exception Not_competent = Pxp_core_types.Not_competent;;
exception Not_resolvable (* of exn *) = Pxp_core_types.Not_resolvable;;

type lexer_source =
    { lsrc_lexbuf : Lexing.lexbuf Lazy.t;
      lsrc_unicode_lexbuf : Netulex.ULB.unicode_lexbuf Lazy.t;
    }



let ensure_space_minimum p f g =
  (* The functional [f] is called with a function as argument that refills
   * a buffer string (think f = Lexing.from_function). This argument is [g],
   * i.e. in most cases this is the same as [f g]. However, it is ensured
   * that the string buffer has the minimum free space [p]. This is achieved
   * by using an auxiliary buffer.
   *)
  let buf = String.create p in
  let bufpos = ref 0 in
  let buflen = ref 0 in
  f (fun s n ->
       assert(n>0);
       if !buflen > 0 then (
	 let m = min n !buflen in
	 String.blit buf !bufpos s 0 m;
	 bufpos := !bufpos + m;
	 buflen := !buflen - m;
	 m
       )
       else
	 if n < p then (
	   let l = g buf p in
	   let m = min l n in
	   String.blit buf 0 s 0 m;
	   bufpos := m;
	   buflen := l-m;
	   m
	 )
	 else 
	   g s n
    )
;;


class type resolver =
  object
    method init_rep_encoding : rep_encoding -> unit
    method init_warner : symbolic_warnings option -> collect_warnings -> unit
    method rep_encoding : rep_encoding
    method open_in : ext_id -> lexer_source
    method open_rid : resolver_id -> lexer_source
    method active_id : resolver_id
    method close_in : unit
    (* method close_all : unit *)
    (* [close_all] is no longer supported in PXP 1.2 *)
    method change_encoding : string -> unit
    method clone : resolver
  end
;;


let null_resolver = resolver_id_of_ext_id Anonymous ;;
(* All components are None *)


class virtual resolve_general
 =
  object (self)
    val mutable internal_encoding = `Enc_utf8

    val mutable is_open = false

    val mutable mode = None
		(* Whether the [lexbuf_reader] or the [unicode_lexbuf_reader]
		 * is used. One can only invoke one of them.
		 *)

    val mutable encoding = `Enc_utf8
    val mutable encoding_requested = false
    val mutable encoding_request_post = (fun () -> ()) (* post action *)

    val mutable active_id = null_resolver

    val mutable warner = new drop_warnings
    val mutable swarner = None

    val mutable enc_initialized = false
    val mutable wrn_initialized = false


    method init_rep_encoding e =
      internal_encoding <- e;
      enc_initialized <- true;

    method init_warner sw w =
      warner <- w;
      swarner <- sw;
      wrn_initialized <- true;

    method rep_encoding = (internal_encoding :> rep_encoding)


    method private warn (k:int) =
      (* Called if a character not representable has been found.
       * k is the character code.
       *)
	if k < 0xd800 or (k >= 0xe000 & k <= 0xfffd) or
	   (k >= 0x10000 & k <= 0x10ffff) then begin
	     warn swarner warner (`W_code_point_cannot_be_represented k);
	   end
	else
	  raise (WF_error("Code point " ^ string_of_int k ^
		    " outside the accepted range of code points"))


    method private autodetect s =
      (* s must be at least 4 bytes long. The slot 'encoding' is
       * set to:
       * "UTF-16-BE": UTF-16/UCS-2 encoding big endian
       * "UTF-16-LE": UTF-16/UCS-2 encoding little endian
       * "UTF-8":     UTF-8 encoding
       *
       * Note: Four bytes are required for cases not yet handled 
       * (e.g. UTF-32).
       *)
      if String.sub s 0 2 = "\254\255" then
	encoding <- `Enc_utf16
	  (* Note: Netconversion.recode will detect the big endianess, too *)
      else if String.sub s 0 2 = "\255\254" then
	encoding <- `Enc_utf16
	  (* Note: Netconversion.recode will detect the little endianess, too *)
      else
	encoding <- `Enc_utf8


    method private virtual next_string : string -> int -> int -> int
    method private virtual init_in : resolver_id -> unit
    method virtual close_in : unit
      (* must reset is_open! *)


    method private lexbuf_reader () =
      if mode <> None && mode <> Some `Lexbuf then 
	failwith "lexbuf_reader: other reader already working";

      mode <- Some `Lexbuf;
      let direct_reader = ref false in  (* whether to bypass the buffer *)

      let buf_max = 4096 in
      let buf     = ref (String.make buf_max ' ')  in
      let buf_beg = ref 0 in
      let buf_end = ref 0 in
      let buf_eof = ref false in
      (* The buffer is used if [not direct_reader]. [buf_beg] is the 
       * beginning of the filled part of the buffer, [buf_end] is the
       * end (plus 1). [buf_eof] indicates that EOF was already signaled,
       * and should be processed after the current contents.
       *)

      let refill() =
	(* Refill the buffer from [next_string].
	 * It is important to refill as much as possible for the
	 * algorithm below.
	 *)
	while not !buf_eof && !buf_end < buf_max do
	  let n = self # next_string !buf !buf_end (buf_max - !buf_end) in
	  if n=0 then
	    buf_eof := true
	  else
	    buf_end := !buf_end + n
	done
      in

      let convert s n max_chars =
	(* Convert characters from [buf] to [s], [n]. The number of characters
	 * it limited by [max_chars]
	 *)
	assert(n>=6);
	if !buf_beg + 6 > !buf_end && not !buf_eof then (
	  (* Less than 6 bytes in [buf]. We need at least this
	   * number of bytes, which is the length of the longest
	   * UTF-8 char, otherwise we cannot ensure to convert
	   * at least one character.
	   *)
	  let m = !buf_end - !buf_beg in
	  String.blit !buf !buf_beg !buf 0 m;
	  buf_beg := 0;
	  buf_end := m;
	  refill();
	);

	(* It is still possible that there are less than 6 bytes
	 * in [buf], but only if [buf_eof] is true at the same
	 * time.
	 *)
	
	let m = !buf_end - !buf_beg in
	if m=0 then (
	  assert !buf_eof;
	  0  (* EOF! *)
	)
	else (
	  let (n_in, n_out, encoding') =
	    Netconversion.recode
	      ~in_enc:encoding
	      ~in_buf:!buf
	      ~in_pos:!buf_beg
	      ~in_len:m
	      ~out_enc:(internal_encoding : rep_encoding :> encoding)
	      ~out_buf:s
	      ~out_pos:0
	      ~out_len:n
	      ~max_chars
	      ~subst:(fun k -> self # warn k; "") in
	  if n_in = 0 then (
	    (* An incomplete character at the end of the stream. 
	     * Note: This test assumes that there is one character to 
	     * convert in [buf], and that there is enough free space in [s].
	     * This is the case because we ensure [m>=6] and [n>=6].
	     *)
	    assert !buf_eof;
	    raise Netconversion.Malformed_code;
	  );
	  encoding <- encoding';
	  buf_beg := !buf_beg + n_in;
	  
	  assert(n_out > 0);
	  n_out
	)
      in

      (* Fill the buffer initially. We start always with buffered reading,
       * but we try to switch to direct reading later.
       *)
      refill();
      if !buf_end >= 4 && not encoding_requested then self # autodetect !buf;

      (* Ensure that [n >= 6], the longest UTF-8 character, so we can always
       * put at least one character into [s]
       *)
      ensure_space_minimum 6
	Lexing.from_function
	(fun s n ->
	   (* Fill the string [s] with at most [n] bytes. Return the number
	    * of bytes, or 0 to signal EOF.
	    *)
	   if not is_open then
	     failwith "trying to read from resolver, but resolver is not open";
	   
	   if !direct_reader then
	     self # next_string s 0 n 
	   else (
	     if encoding_requested then (
	       (* In this case, the encoding will not change any more. We
		* can read any number of characters at once.
		*)
	       if encoding = (internal_encoding : rep_encoding :> encoding)
	       then (
		 (* No conversion is needed. In order to speed up reading,
		  * we are going to enable [direct_reader].
		  *)
		 if !buf_beg < !buf_end then (
		   (* There are still bytes in [buf], return them first *)
		   let m = min n (!buf_end - !buf_beg) in
		   String.blit !buf !buf_beg s 0 m;
		   buf_beg := !buf_beg + m;
		   m
		 )
		 else (
		   (* Either we are already at EOF, or we can switch to
		    * [direct_reader].
		    *)
		   buf := "";  (* Free buf, it will never be used again *)
		   if !buf_eof then
		     0
		   else (
		     direct_reader := true;
		     self # next_string s 0 n
		   )
		 )
	       )
	       else (
		 (* Character conversion is needed from [encoding] to
		  * [internal_encoding].
		  *)
		 convert s n max_int
	       )
	     )
	     else (
	       (* In this case, the encoding might change at any time.
		* Because of this, we only read one character at a time.
		*)
	       convert s n 1
	     )
	   )
	)


    method private unicode_lexbuf_reader () =
      if mode <> None && mode <> Some `Unicode_lexbuf then 
	failwith "unicode_lexbuf_reader: other reader already working";

      mode <- Some `Unicode_lexbuf;

      let buf = Netbuffer.create 4 in
      (* Only used for autodetection! *)

      let buf_eof = ref false in

      if not encoding_requested then (
	while not !buf_eof && Netbuffer.length buf < 4 do
	  let n =
	    Netbuffer.add_inplace ~len:(4-Netbuffer.length buf) buf self#next_string
	  in
	  buf_eof := (n=0)
	done;
	if Netbuffer.length buf >= 4  then
	  self # autodetect (Netbuffer.contents buf);
      );

      let lexbuf =
	Netulex.ULB.from_function
	  ~enc_change_hook:(
	    fun ulb -> 
	      encoding <- ulb.Netulex.ULB.ulb_encoding )
	  ~refill:(
	    fun s p n ->
	      (* Fill the string [s] at position [p] with at most [n] bytes. 
	       * Return the number of bytes, or 0 to signal EOF.
	       *)
	      if not is_open then
		failwith "trying to read from resolver, but resolver is not open";
	      
	      if encoding_requested && Netbuffer.length buf = 0 then (
		(* In this case, the encoding will not change any more. We
		 * can read any number of characters at once.
		 *)
		self # next_string s p n
	      )
	      else (
		(* In this case, the encoding might change at any time.
		 * Because of this, we only read one character at a time.
		 * If there are still characters in [buf], take these first.
		 *)
		
		if Netbuffer.length buf > 0 then (
		  let c = (Netbuffer.contents buf).[0] in
		  s.[p] <- c;
		  Netbuffer.delete buf 0 1;
		  1
		)

		else (
		  if !buf_eof then
		    0 (* EOF already seen *)
		  else
		    self # next_string s p 1
		)
	      )
	  )
	  encoding
      in

      encoding_request_post <- (
	fun () ->
	  (* This function is called after the encoding was requested for
	   * the first time
	   *)  
	  Netulex.ULB.set_encoding encoding lexbuf
      );

      lexbuf

    method open_in xid =
      self # open_rid (resolver_id_of_ext_id xid)

    method open_rid rid =
      assert(enc_initialized && wrn_initialized);

      encoding <- `Enc_utf8;
      encoding_requested <- false;
      self # init_in rid;         (* may raise Not_competent *)
      (* init_in: may already set 'encoding' *)

      is_open <- true;

      { lsrc_lexbuf         = lazy(self # lexbuf_reader() );
	lsrc_unicode_lexbuf = lazy(self # unicode_lexbuf_reader() );
      }

    method change_encoding enc =
      if not is_open then
	failwith "#change_encoding: resolver is not open";
      if not encoding_requested then begin
	if enc <> "" then begin
	  match Netconversion.encoding_of_string enc with
	      `Enc_utf16 ->
		(match encoding with
		     (`Enc_utf16_le | `Enc_utf16_be) -> ()
		   | `Enc_utf16 -> assert false
		   | _ ->
		       raise(WF_error "Encoding of data stream and encoding declaration mismatch")
		)
	    | e ->
		encoding <- e
	end;
	(* else: the autodetected encoding counts *)
	encoding_requested <- true;
	encoding_request_post();
      end;

    method active_id = 
      if not is_open then failwith "#active_id: resolver is not open";
      active_id

  end
;;

type accepted_id =
    in_obj_channel * encoding option * resolver_id option
;;

let close_ch (ch : in_obj_channel) = ch # close_in() ;;

class resolve_to_any_obj_channel ?(close=close_ch) ~channel_of_id () =
object(self)
    inherit resolve_general as super

    val f_open = channel_of_id
    val mutable current_channel = None
    val close = close

    method private init_in (id:resolver_id) =
      if current_channel <> None then
	failwith "Pxp_reader.resolve_to_any_obj_channel # init_in";
      let ch, enc_opt, rid_opt = f_open id in    (* may raise Not_competent *)
      begin match enc_opt with
	  None     -> ()
	| Some enc -> encoding <- enc; encoding_requested <- true
      end;
      begin match rid_opt with
	  None     -> active_id <- id
	| Some r   -> active_id <- r
      end;
      current_channel <- Some ch;

    method private next_string s ofs len =
      match current_channel with
	  None -> failwith "Pxp_reader.resolve_read_any_channel # next_string"
	| Some ch ->
	    try
	      ch # input s ofs len
	    with
		End_of_file -> 0

    method close_in =
      is_open <- false;
      match current_channel with
	  None -> ()
	| Some ch ->
	    close ch;
	    current_channel <- None

    method clone =
      let c = new resolve_to_any_obj_channel
		?close:(Some close) ~channel_of_id:f_open () in
      c # init_rep_encoding internal_encoding;
      c # init_warner swarner warner;
      (* clones <- c :: clones; *)
      (c :> resolver)

end
;;


let rid_rid_intersection bound_rid actual_rid =
  (* Returns a resolver_id where unequal IDs are reset to None. The
   * rid_system_base is set corresponding to rid_system.
   *
   * Notes: 
   * (1) an empty SYSTEM name does not match another empty SYSTEM name
   * (2) PUBLIC names must be normalized
   *)
  let isect opt1 opt2 =
    if opt1 = opt2 then opt1 else None
  in
  let sys_isect opt1 opt2 =
    if opt1 = opt2 && opt1 <> Some "" then opt1 else None
  in
  { rid_private = isect bound_rid.rid_private actual_rid.rid_private;
    rid_public  = isect bound_rid.rid_public actual_rid.rid_public;
    rid_system  = sys_isect bound_rid.rid_system actual_rid.rid_system;
    rid_system_base = if bound_rid.rid_system = actual_rid.rid_system  &&
                         bound_rid.rid_system <> None &&
                         bound_rid.rid_system <> Some ""
                      then
                        actual_rid.rid_system_base
                      else
			None;
  }
;;


let rid_matches_rid bound_rid actual_rid =
  (* definition:
   * rid_matches_rid r1 r2 =def= 
   * rid_rid_intersection r1 r2 <> null_resolver
   *
   * See also the notes for rid_rid_intersection
   *)
  (bound_rid.rid_private <> None && 
   bound_rid.rid_private = actual_rid.rid_private) ||
  (bound_rid.rid_public <> None && 
   bound_rid.rid_public = actual_rid.rid_public) ||
  (bound_rid.rid_system <> None && 
   bound_rid.rid_system <> Some "" && 
   bound_rid.rid_system = actual_rid.rid_system)
;;


let xid_rid_intersection bound_xid actual_rid =
  rid_rid_intersection (resolver_id_of_ext_id bound_xid) actual_rid
;;


let xid_matches_rid bound_xid actual_rid =
  (* definition:
   * xid_matches_rid x r =def= 
   * xid_rid_intersection x r <> null_resolver
   *
   * See also the notes for rid_rid_intersection
   *)
  match bound_xid with
      System sys -> 
	sys <> "" && actual_rid.rid_system = Some sys
    | Public(pub,sys) ->
	(actual_rid.rid_public = Some pub) || 
	(sys <> "" && actual_rid.rid_system = Some sys)
    | Anonymous ->
	false
    | Private p ->
	actual_rid.rid_private = Some p
;;


let id_intersection bound_rid_opt bound_xid_opt actual_rid =
  (* Intersections the actual_rid with both bound_rid_opt and bound_xid_opt
   * in turn
   *)
  let rid1 =
    match bound_rid_opt with
	Some bound_rid -> rid_rid_intersection bound_rid actual_rid
      | None           -> actual_rid
  in
  let rid2 =
    match bound_xid_opt with
	Some bound_xid -> xid_rid_intersection bound_xid rid1
      | None           -> rid1
  in
  rid2
;;


class resolve_to_this_obj_channel1 is_stale ?id ?rid ?fixenc ?close ch =

  let getchannel = ref (fun rid -> assert false) in

  object (self)
    inherit resolve_to_any_obj_channel
              ?close
	      ~channel_of_id:(fun rid -> !getchannel rid)
	      ()
	      as super

    val is_stale = is_stale
      (* The channel can only be read once. To avoid that the channel
       * is opened several times, the flag 'is_stale' is set after the
       * first time.
       *)

    val fixid = id
    val fixrid = rid
    val fixenc = fixenc
    val fixch = ch

    initializer
      getchannel := self # getchannel

    method private getchannel rid =
      let m_xid =
	match fixid with
	    None -> false
	  | Some bound_xid ->
	      xid_matches_rid bound_xid rid
      in
      let m_rid =
	match fixrid with
	    None -> false
	  | Some bound_rid ->
	      rid_matches_rid bound_rid rid
      in
      (* By definition, if both fixid and fixrid are None, the resolver
       * will match always.
       *)
      if (fixid <> None || fixrid <> None) && not m_xid && not m_rid then
	raise Not_competent;
      let final_id = id_intersection fixrid fixid rid in
      ch, fixenc, Some final_id

    method private init_in (rid:resolver_id) =
      if !is_stale then
	raise Not_competent
      else begin
	super # init_in rid;
	is_stale := true
      end

    method clone =
      let c = new resolve_to_this_obj_channel1
		is_stale
		?id:fixid ?rid:fixrid ?fixenc:fixenc ?close:(Some close) fixch
      in
      c # init_rep_encoding internal_encoding;
      c # init_warner swarner warner;
      (* clones <- c :: clones; *)
      (c :> resolver)

  end
;;


class resolve_to_this_obj_channel ?id ?rid ?fixenc ?close ch =
  let is_stale = ref false in
  resolve_to_this_obj_channel1 is_stale ?id ?rid ?fixenc ?close ch
;;


class resolve_to_url_obj_channel ?close 
                                 ~url_of_id ~base_url_of_id ~channel_of_url () =
  let channel_of_id rid =
    let rel_url = url_of_id rid in    (* may raise Not_competent *)
    try
      (* Now compute the absolute URL: *)
      let abs_url = 
	if Neturl.url_provides ~scheme:true rel_url then
	  rel_url
	else
	  let base_url = base_url_of_id rid in
	  Neturl.apply_relative_url base_url rel_url in
          (* may raise Malformed_URL *)

      (* Simple check whether 'abs_url' is really absolute: *)
      if not(Neturl.url_provides ~scheme:true abs_url)
      then raise Not_competent;

      let rid' =
	{ rid with 
	    rid_system = Some(Neturl.string_of_url abs_url)
	} in
      
      (* Get and return the channel: *)
      let ch, enc_opt, active_id_opt =
	channel_of_url rid' abs_url            (* may raise Not_competent *)
      in
      (ch, 
       enc_opt, 
       (match active_id_opt with
	    None -> Some rid'
	  | _    -> active_id_opt
       ))
    with
	Neturl.Malformed_URL -> raise (Not_resolvable Neturl.Malformed_URL)
      | Not_competent        -> raise (Not_resolvable Not_found)
  in
  resolve_to_any_obj_channel ?close ~channel_of_id ()
;;


let base_url_syntax =
  { Neturl.null_url_syntax with
      Neturl.url_enable_scheme = Neturl.Url_part_required;
      Neturl.url_enable_host   = Neturl.Url_part_allowed;
      Neturl.url_enable_path   = Neturl.Url_part_required;
      Neturl.url_accepts_8bits = true;
  }
;;


type spec = [ `Not_recognized | `Allowed | `Required ]

class resolve_as_file
  ?(file_prefix = (`Allowed :> spec))
  ?(host_prefix = (`Allowed :> spec))
  ?(system_encoding = `Enc_utf8)
  ?(map_private_id = (fun _ -> raise Not_competent))
  ?(open_private_id = (fun _ -> raise Not_competent))
  ?(base_url_defaults_to_cwd = false)
  ?(not_resolvable_if_not_found = true)
  ()
  =

  let url_syntax =
    let enable_if =
      function
	  `Not_recognized  -> Neturl.Url_part_not_recognized
	| `Allowed         -> Neturl.Url_part_allowed
	| `Required        -> Neturl.Url_part_required
    in
    { Neturl.null_url_syntax with
	Neturl.url_enable_scheme = enable_if file_prefix;
	Neturl.url_enable_host   = enable_if host_prefix;
	Neturl.url_enable_path   = Neturl.Url_part_required;
	Neturl.url_accepts_8bits = true;
    }
  in

  let default_base_url =
    if base_url_defaults_to_cwd then begin
      let cwd = Sys.getcwd() in
      let cwd_utf8 = 
	Netconversion.recode_string
	  ~in_enc: system_encoding
	  ~out_enc: `Enc_utf8 
	  cwd in
      let l = String.length cwd_utf8 in
      let cwd_utf8 =
	if cwd_utf8 = "" || cwd_utf8.[l-1] <> '/' then 
	  cwd_utf8 ^ "/" 
	else
	  cwd_utf8 in
      Some(Neturl.file_url_of_local_path cwd_utf8)
    end
    else
      None
  in

  let use_private_id = ref false in
  let url_of_id rid =
    let file_url_of_sysname sysname =
      (* By convention, we can assume that sysname is a URL conforming
       * to RFC 1738 with the exception that it may contain non-ASCII
       * UTF-8 characters.
       *)
      try
	Neturl.url_of_string url_syntax sysname
          (* may raise Malformed_URL *)
      with
	  Neturl.Malformed_URL -> raise Not_competent
    in
    use_private_id := false;
    let url =
      match rid.rid_system with
	  None -> 
	    ( match rid.rid_private with
		  None -> 
		    raise Not_competent
		| Some p -> 
		    let url = map_private_id p in
		    use_private_id := true;
		    url
	    )
	| Some sysname -> file_url_of_sysname sysname
    in
    let scheme =
      try Neturl.url_scheme url with Not_found -> "file" in
    let host =
      try Neturl.url_host url with Not_found -> "" in

    if scheme <> "file" then raise Not_competent;
    if host <> "" && host <> "localhost" then raise Not_competent;

    url
  in

  let base_url_of_id rid =
    match rid.rid_system_base with
	Some sysname ->
	  Neturl.url_of_string base_url_syntax sysname
      | None ->
	  ( match default_base_url with
		Some url -> url
	      | None     -> raise Not_competent
	  )
  in

  let channel_of_url rid url =
    if !use_private_id then begin
      match rid.rid_private with
	  Some p ->
	    let ch, enc_opt = open_private_id p in
	    (new input_channel ch, enc_opt, None)
	| None ->
	    assert false
    end
    else begin
      try
	let path_utf8 =
	  Neturl.local_path_of_file_url url (* may fail *)
	in
	(* Note: it is only assumed that the path is UTF-8 *)
	      
	let path =
	  Netconversion.recode_string
	    ~in_enc:  `Enc_utf8
	    ~out_enc: system_encoding
	    path_utf8 in
            (* May raise Malformed_code *)

	if (not not_resolvable_if_not_found) && not(Sys.file_exists path) then
	  raise Not_competent;
	      
	(new input_channel(open_in_bin path), None, None)
	(* May raise Sys_error *)
		
      with
	| Netconversion.Malformed_code as e ->
	    raise (Not_resolvable e)
	| Sys_error _ as e ->
	    raise (Not_resolvable e)
    end
  in

  resolve_to_url_obj_channel
    ~url_of_id
    ~base_url_of_id
    ~channel_of_url
    ()
;;


let make_file_url ?(system_encoding = `Enc_utf8) ?(enc = `Enc_utf8) filename =
  let utf8_filename =
    Netconversion.recode_string
    ~in_enc: enc
    ~out_enc: `Enc_utf8 
      filename
  in

  let getcwd() =
    let cwd = Sys.getcwd() in
    let cwd_utf8 = 
      Netconversion.recode_string
	~in_enc: system_encoding
	~out_enc: `Enc_utf8 
	cwd in
    cwd_utf8
  in

  Neturl.file_url_of_local_path ~getcwd utf8_filename
;;


class lookup_id_nonorm (catalog : (ext_id * resolver) list) =
( object (self)
    val cat = catalog
    val mutable internal_encoding = `Enc_utf8
    val mutable warner = new drop_warnings
    val mutable swarner = None
    val mutable active_resolver = None

    method init_rep_encoding enc =
      internal_encoding <- enc

    method init_warner sw w =
      swarner <- sw;
      warner <- w;

    method rep_encoding = internal_encoding
      (* CAUTION: This may not be the truth! *)

    method open_in xid =
      self # open_rid (resolver_id_of_ext_id xid)

    method open_rid rid =

      if active_resolver <> None then failwith "Pxp_reader.lookup_* # open_rid";

      let selected_xid, r =
	try
	  List.find
	    (fun (xid,r) ->
	       xid_matches_rid xid rid
	    )
	    cat
	with
	    Not_found -> raise Not_competent
      in

      let r' = r # clone in
      r' # init_rep_encoding internal_encoding;
      r' # init_warner swarner warner;
      let lb = r' # open_rid rid in   (* may raise Not_competent *)
      active_resolver <- Some (selected_xid,r');
      lb

    method close_in =
      match active_resolver with
	  None   -> ()
	| Some(_,r) -> 
	    r # close_in;
	    active_resolver <- None

    method active_id =
      match active_resolver with
	  None   -> failwith "#active_id: resolver is not open"
	| Some(selected_xid, r) -> 
	    ( match selected_xid with
		  Private p ->
		    { null_resolver with rid_private = Some p }
		| System sysid ->
		    { null_resolver with rid_system = Some sysid }
		    (* Note: Relative URLs do not make sense in catalogs,
		     * so ignore this case here
		     *)
		| Public(pubid,sysid) ->
		    { null_resolver with
			rid_public = Some pubid;
			rid_system = if sysid = "" then None else Some sysid;
		    }
		| Anonymous ->
		    assert false
	    )

(*
    method close_all =
      (* CHECK: Müssen nicht die Klone auch geschlossen werden? *)
      self # close_in
*)

    method change_encoding (enc:string) =
      match active_resolver with
	  None      -> failwith "Pxp_reader.lookup_* # change_encoding"
	| Some(_,r) -> r # change_encoding enc


    method clone =
      let c = new lookup_id_nonorm cat in
      c # init_rep_encoding internal_encoding;
      c # init_warner swarner warner;
      c
  end : resolver )
;;


class lookup_id (catalog : (ext_id * resolver) list) =
  let norm_catalog =
    (* catalog with normalized PUBLIC ids *)
    List.map 
      (fun (id,s) -> 
	 match id with
	     Public(pubid,sysid) ->
	       let norm_pubid = Pxp_aux.normalize_public_id pubid in
	       (Public(norm_pubid,sysid), s)
	   | _ ->
	       (id,s)
      )
      catalog in
  lookup_id_nonorm norm_catalog
;;


class lookup_id_as_file ?(fixenc:encoding option) catalog =
  let ch_of_id filename id =
    let ch = open_in_bin filename in  (* may raise Sys_error *)
    (new input_channel ch, fixenc, None)
  in
  let catalog' =
    List.map
      (fun (id,s) ->
	 (id, 
	  new resolve_to_any_obj_channel ~channel_of_id:(ch_of_id s) ()
	 )
      )
      catalog
  in
  lookup_id catalog'
;;


class lookup_id_as_string ?(fixenc:encoding option) catalog =
  let ch_of_id s rid =
    (new input_string s, fixenc, None)
  in
  let catalog' =
    List.map
      (fun (id,s) ->
	 (id, 
	  new resolve_to_any_obj_channel ~channel_of_id:(ch_of_id s) ()
	 )
      )
      catalog
  in
  lookup_id catalog'
;;
   

let map_public_id catalog =
  List.map (fun (pubid,x) -> (Public(pubid,""), x)) catalog
;;


let map_system_id catalog =
  List.map (fun (sysid,x) -> (System sysid, x)) catalog
;;
  

class lookup_public_id catalog =
  lookup_id (map_public_id catalog)
;;


class lookup_public_id_as_file ?fixenc catalog =
  lookup_id_as_file ?fixenc (map_public_id catalog)
;;


class lookup_public_id_as_string ?fixenc catalog =
  lookup_id_as_string ?fixenc (map_public_id catalog)
;;


class lookup_system_id catalog =
  lookup_id (map_system_id catalog)
;;


class lookup_system_id_as_file ?fixenc catalog =
  lookup_id_as_file ?fixenc (map_system_id catalog)
;;


class lookup_system_id_as_string ?fixenc catalog =
  lookup_id_as_string ?fixenc (map_system_id catalog)
;;
   

type combination_mode =
    Public_before_system
  | System_before_public
;;


class combine ?mode rl =
  object (self)
    val mode = mode
    val resolvers = (rl : resolver list)
    val mutable internal_encoding = `Enc_utf8
    val mutable warner = new drop_warnings
    val mutable swarner = None
    val mutable active_resolver = None

(*  (* needed to support close_all: *)
    val mutable clones = []
*)

    method init_rep_encoding enc =
      List.iter
	(fun r -> r # init_rep_encoding enc)
	rl;
      internal_encoding <- enc

    method init_warner sw w =
      List.iter
	(fun r -> r # init_warner sw w)
	rl;
      swarner <- sw;
      warner <- w;

    method rep_encoding = internal_encoding
      (* CAUTION: This may not be the truth! *)

    method open_in xid =
      self # open_rid (resolver_id_of_ext_id xid)

    method open_rid rid =
      let rec find_competent_resolver_for rid' rl =
	match rl with
	    r :: rl' ->
	      begin try
		r, (r # open_rid rid')
	      with
		  Not_competent -> find_competent_resolver_for rid' rl'
	      end;
	  | [] ->
	      raise Not_competent
      in

      let find_competent_resolver rl =
	match mode with
	    None -> 
	      find_competent_resolver_for rid rl
	  | Some Public_before_system ->
	      ( try
		  find_competent_resolver_for 
		    { rid with rid_system = None } rl
		with
		    Not_competent ->
		      find_competent_resolver_for 
		        { rid with rid_public = None } rl
	      )
	  | Some System_before_public ->
	      ( try
		  find_competent_resolver_for 
		    { rid with rid_public = None } rl
		with
		    Not_competent ->
		      find_competent_resolver_for 
		        { rid with rid_system = None } rl
	      )
      in

      if active_resolver <> None then failwith "Pxp_reader.combine # open_rid";
      let r, lb =
	find_competent_resolver resolvers
      in
      active_resolver <- Some r;
      lb

    method close_in =
      match active_resolver with
	  None   -> ()
	| Some r -> r # close_in;
	            active_resolver <- None

(*
    method close_all =
      List.iter (fun r -> r # close_in) clones
*)

    method change_encoding (enc:string) =
      match active_resolver with
	  None   -> failwith "Pxp_reader.combine # change_encoding"
	| Some r -> r # change_encoding enc

    method active_id =
      match active_resolver with
	  None -> failwith "#active_id: resolver not open"
	| Some r -> r # active_id

    method clone =
      let c =
	match active_resolver with
	    None   ->
	      new combine ?mode
                          (List.map (fun q -> q # clone) resolvers)
	  | Some r ->
	      let r' = r # clone in
	      new combine
		?mode
		(List.map
		   (fun q -> if q == r then r' else q # clone)
		   resolvers)
      in
      c # init_rep_encoding internal_encoding;
      c # init_warner swarner warner;
      (* clones <- c :: clones; *)
      c
  end
;;


let norm_url_syntax =
  { Neturl.null_url_syntax with
      Neturl.url_enable_scheme = Neturl.Url_part_allowed;
      Neturl.url_enable_user = Neturl.Url_part_allowed;
      Neturl.url_enable_password = Neturl.Url_part_allowed;
      Neturl.url_enable_host = Neturl.Url_part_allowed;
      Neturl.url_enable_port = Neturl.Url_part_allowed;
      Neturl.url_enable_path = Neturl.Url_part_required;
      (* rest: Url_part_not_recognized *)
      Neturl.url_accepts_8bits = true;
  }
;;


class norm_system_id (subresolver : resolver) =
object(self)
  val subresolver = subresolver
  val mutable current_rid = null_resolver  (* for rewrite_system_id *)

  method init_rep_encoding enc =
    subresolver # init_rep_encoding enc

  method init_warner sw w =
    subresolver # init_warner sw w;

  method rep_encoding =
    subresolver # rep_encoding

  method open_in xid =
    (* It is not possible to normalize the SYSTEM id of a xid *)
    subresolver # open_in xid

  method open_rid rid =
    (* (1) check that the system name is a URL
     * (2) if the URL is relative: make it absolute (use system base name)
     * (3) remove .. and . from the URL path as much as possible
     * (4) all other names are left unmodified
     *)

    let norm sysname =
      try
	(* prerr_endline ("sysname=" ^ sysname); *)
	let sysurl = Neturl.url_of_string norm_url_syntax sysname in
	let sysurl_abs =
	  if Neturl.url_provides ~scheme:true sysurl then
	    sysurl
	  else
	    match rid.rid_system_base with
		None -> 
		  (* The sysurl is relative, but we do not have a base URL.
		   * There is no way to interpret this case, so we reject
		   * it.
		   *)
		  raise Not_competent
	      | Some sysbase -> 
		  let baseurl = Neturl.url_of_string norm_url_syntax sysbase in
		  Neturl.apply_relative_url baseurl sysurl
	in
	let path = Neturl.url_path sysurl_abs in
	let path' = Neturl.norm_path path in  (* remove .., ., // *)
	let sysurl' = Neturl.modify_url ~path:path' sysurl_abs in
	(* prerr_endline ("Before rewrite: " ^ Neturl.string_of_url sysurl');
	 *)
	let sysurl'' = self # rewrite sysurl' in
	let sysname' = Neturl.string_of_url sysurl'' in
	(* prerr_endline ("sysname'=" ^ sysname'); *)
	sysname'
      with
	  Neturl.Malformed_URL ->
	    raise Not_competent
    in
      
    let rid' =
      { rid with
	  rid_system = ( match rid.rid_system with
			     None -> 
			       None
			   | Some sysname ->
			       Some(norm sysname)
		       )
      }
    in
    let lex = subresolver # open_rid rid' in
    current_rid <- rid;   (* the original, unmodified version! *)
    lex

  method private rewrite sysurl = sysurl

  method close_in =
    subresolver # close_in

(*
  method close_all =
    subresolver # close_all
*)

  method change_encoding enc =
    subresolver # change_encoding enc

  method active_id =
    subresolver # active_id

  method clone =
    let c = subresolver # clone in
    ( {< subresolver = c >} :> resolver )
end
;;


let try_to_get f arg =
  try Some(f arg) with Not_found -> None
;;


let remove_trailing_slash p =
  match p with
      [] -> []
    | [""] -> [""]
    | _ -> 
	let p' = List.rev p in
	if List.hd p' = "" then
	  List.rev(List.tl p')
	else
	  p
;;


let rec path_matches pattern p =
  match (pattern, p) with
      ( [], [] ) ->
	(* Case: pattern = p *)
	true
    | ( [""], (_::_) ) ->
	(* Case: pattern ends with slash, and is a prefix of p *)
	true
    | ( (pat :: pattern'), (p0 :: p') ) when pat = p0 ->
	path_matches pattern' p'
    | _ ->
	false
;;


let rec path_subst pattern subst p =
  match (pattern, p) with
      ( [], [] ) ->
	(* Case: pattern = p *)
	subst
    | ( [""], (_::_) ) ->
	(* Case: pattern ends with slash, and is a prefix of p *)
	(* If subst ends with a slash, remove it *)
	let subst' = remove_trailing_slash subst in
	subst' @ p
    | ( (pat :: pattern'), (p0 :: p') ) when pat = p0 ->
	path_subst pattern' subst p'
    | _ ->
	assert false  (* no match *)
;;


class rewrite_system_id ?(forward_unmatching_urls=false) rw_spec subresolver =
object(self)
  inherit norm_system_id subresolver
  val forward_unmatching_urls = forward_unmatching_urls
  val rw_spec = 
    List.map
      (fun (sysfrom, systo) ->
	 let sysfrom_url = 
	   Neturl.url_of_string norm_url_syntax sysfrom in
	 let systo_url =
	   Neturl.url_of_string norm_url_syntax systo in
	 (* if sysfrom_url ends with a slash, systo_url must end with it, too
	  *)
	 let ends_with_slash url =
	   let path = Neturl.url_path url in
	   List.hd (List.rev path) = ""
	 in
	 if ends_with_slash sysfrom_url && not(ends_with_slash systo_url) then
	   failwith "Illegal rewrite specification: Cannot map directory to non-directory";
	 (sysfrom_url, systo_url)
      )
      rw_spec

  method private rewrite url =
    try
      let sysfrom_url, systo_url =
	List.find                              (* may raise Not_found *)
	  (fun (sysfrom_url, systo_url) ->
	     (* Check whether url matches sysfrom_url *)
	     (try_to_get Neturl.url_scheme sysfrom_url =
		try_to_get Neturl.url_scheme url) &&
	     (try_to_get Neturl.url_user sysfrom_url =
		try_to_get Neturl.url_user url) &&
	     (try_to_get Neturl.url_password sysfrom_url =
		try_to_get Neturl.url_password url) &&
	     (try_to_get Neturl.url_host sysfrom_url =
		try_to_get Neturl.url_host url) &&
	     (try_to_get Neturl.url_port sysfrom_url =
		try_to_get Neturl.url_port url) &&
	     (let sysfrom_p = Neturl.url_path sysfrom_url in
	      let p = Neturl.url_path url in
	      (List.hd sysfrom_p = "") &&   (* i.e. sysfrom_p is absolute *)
	      (List.hd p = "") &&           (* i.e. p is absolute *)
	      (path_matches sysfrom_p p))
	  )
	  rw_spec
      in
      (* prerr_endline("sysfrom_url=" ^ Neturl.string_of_url sysfrom_url);
	 prerr_endline("systo_url=" ^ Neturl.string_of_url systo_url);
      *)
      let sysfrom_p = Neturl.url_path sysfrom_url in
      let systo_p = Neturl.url_path systo_url in
      let p = Neturl.url_path url in
      let p' = path_subst sysfrom_p systo_p p in
      Neturl.modify_url ~path:p' systo_url
    with
	Not_found ->
	  if forward_unmatching_urls then
	    url
	  else
	    raise Not_competent

  method active_id =
    (* hide the rewritten URL *)
    let aid = subresolver # active_id in
    { aid with
	rid_system = current_rid.rid_system;
	rid_system_base = current_rid.rid_system_base;
    }
end
;;


(**********************************************************************)
(* EMULATION OF DEPRECATED CLASSES                                    *)
(**********************************************************************)

let rec try_several f l =
  (* Applies the function f to all elements of l in turn. The function can
   * return a result value, or can raise Not_competent. The elements are
   * tried until a result value is found. If no element leads to a result,
   * the exception Not_competent is raised.
   *)
  match l with
      [] -> 
	raise Not_competent
    | x :: l' ->
	( try f x with Not_competent -> try_several f l')
;;


let xid_list_of_rid rid =
  (* Returns a list of ext_ids that are compatible to the rid *)
  (match rid.rid_private with
       Some p -> [ Private p ]
     | None   -> []
  ) @
  (match rid.rid_public, rid.rid_system with
       (Some pub, Some sys) -> [ Public(pub,sys) ]
     | (Some pub, None)     -> [ Public(pub,"") ]
     | (None, Some sys)     -> [ System(sys) ]
     | (None, None)         -> []
  ) @ 
  [ Anonymous ]
  (* We always append Anonymous, because this xid is included in all rids *)
;;


class resolve_read_any_channel ?(close=close_in) ~channel_of_id () =
  (* reduce resolve_read_any_channel to resolve_to_any_obj_channel *)
  let current_ch = ref None in
  let obj_channel_of_id rid =
    try_several
      (fun xid ->
	 let ch, enc_opt = channel_of_id xid in  
	                   (* may raise Not_competent *)
	 current_ch := Some ch;
	 (new input_channel ch,
	  enc_opt,
	  Some (resolver_id_of_ext_id xid))
      )
      (xid_list_of_rid rid)
  in
  let close_obj _ =
    match !current_ch with
	Some ch -> close ch; current_ch := None
      | None -> ()
  in

  resolve_to_any_obj_channel 
    ~close:close_obj
    ~channel_of_id:obj_channel_of_id
    ()
;;


class resolve_read_this_channel ?id ?fixenc ?close ch =
  (* reduce resolve_read_this_channel to resolve_to_this_obj_channel *)
  let obj_ch =
    new input_channel ch in
  let close_obj _ =
    match close with
	Some f -> f ch
      | None -> ()
  in
  resolve_to_this_obj_channel ?id ?fixenc ~close:close_obj obj_ch
;;


class resolve_read_any_string ~string_of_id () =
  (* reduce resolve_read_any_string to resolve_to_any_obj_channel *)
  let obj_channel_of_id rid =
    try_several
      (fun xid ->
	 let s, enc_opt = string_of_id xid in  
	                   (* may raise Not_competent *)
	 (new input_string s,
	  enc_opt,
	  Some(resolver_id_of_ext_id xid))
      )
      (xid_list_of_rid rid)
  in
  resolve_to_any_obj_channel ~channel_of_id:obj_channel_of_id ()
;;


class resolve_read_this_string ?id ?fixenc s =
  let string_of_id tried_xid =
    match id with
	None ->
	  (* Open always! *)
	  (s, fixenc)
      | Some my_xid ->
	  if my_xid = tried_xid then
	    (s, fixenc)
	  else
	    raise Not_competent
  in
  resolve_read_any_string ~string_of_id ()
;;


class resolve_read_url_channel
  (* reduce resolve_read_url_channel to resolve_to_url_obj_channel *)
  ?(base_url = Neturl.null_url)
  ?(close = close_in)
  ~url_of_id
  ~channel_of_url
  () =
  let current_ch = ref None in
  let current_xid = ref Anonymous in
  let url_of_id' rid =
    try_several 
      (fun xid ->
	 let url = url_of_id xid  in (* or Not_competent *)
	 current_xid := xid;
	 url
      )
      (xid_list_of_rid rid) in
  let base_url_of_id rid =
    ( match rid.rid_system_base with
	  Some sys -> Neturl.url_of_string base_url_syntax sys
	| None     -> raise Not_competent
    ) in
  let channel_of_url' rid url =
    let ch, enc_opt = channel_of_url !current_xid url in  
	              (* may raise Not_competent *)
    current_ch := Some ch;
    let active_id = resolver_id_of_ext_id !current_xid in
    let active_id' = { active_id with
			 rid_system = rid.rid_system
		     } in
    (new input_channel ch, enc_opt, Some active_id')
  in
  let close_obj _ =
    match !current_ch with
	Some ch -> close ch; current_ch := None
      | None -> ()
  in

  resolve_to_url_obj_channel
    ~close:close_obj
    ~url_of_id:url_of_id'
    ~base_url_of_id
    ~channel_of_url:channel_of_url'
    ()
;;


let lookup_public_id_as_file ?fixenc catalog =
  new lookup_public_id_as_file ?fixenc catalog ;;

let lookup_public_id_as_string ?fixenc catalog =
  new lookup_public_id_as_string ?fixenc catalog ;;

let lookup_system_id_as_file ?fixenc catalog =
  new lookup_system_id_as_file ?fixenc catalog ;;

let lookup_system_id_as_string ?fixenc catalog =
  new lookup_system_id_as_string ?fixenc catalog ;;





This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml