Plasma GitLab Archive
Projects Blog Knowledge

(* $Id: pxp_reader.mli 689 2004-08-07 17:01:52Z gerd $
 * ----------------------------------------------------------------------
 * PXP: The polymorphic XML parser for Objective Caml.
 * Copyright by Gerd Stolpmann. See LICENSE for details.
 *)

(* Purpose of this module: The Pxp_reader module allows you to exactly
 * specify how external identifiers (SYSTEM or PUBLIC) are mapped to
 * files or channels. This is normally only necessary for advanced
 * configurations, as the functions from_file, from_channel, and
 * from_string in Pxp_types often suffice.
 *
 * There are two ways to use this module. First, you can compose the
 * desired behaviour by combining several predefined resolver objects
 * or functions. See the example section at the end of the file.
 * Second, you can inherit from the classes (or define a resolver class
 * from scratch). I hope this is seldom necessary as this way is much
 * more complicated; however it allows you to implement any magic.
 *)


open Pxp_core_types;;

exception Not_competent;;
  (* Raised by the 'open_in' and 'open_rid' methods if the object does not
   * know how to handle the passed ID. Resolvers that combine other
   * resolvers (see class 'combine') react to this exception by trying the
   * next contained resolver.
   *)

exception Not_resolvable of exn;;
  (* Indicates that one resolver was competent, but there was an error
   * while resolving the external ID. The passed exception explains the
   * reason.
   * Not_resolvable(Not_found) serves as indicator for an unknown reason.
   *
   * In contrast to Not_competent, this exception means that the resolver
   * has accepted the ID, so it signals a real failure and not merely
   * "try somebody else".
   *)

(* A lexer_source is what the 'open_in' and 'open_rid' methods of a
 * resolver return. It offers the opened input in two alternative forms,
 * both wrapped into Lazy.t:
 *  - lsrc_lexbuf:         a classic Lexing.lexbuf
 *  - lsrc_unicode_lexbuf: a Netulex.ULB.unicode_lexbuf
 *
 * For one lexer_source, use either [lsrc_lexbuf] or [lsrc_unicode_lexbuf],
 * but never both!
 *)
type lexer_source = {
  lsrc_lexbuf : Lexing.lexbuf Lazy.t;
  lsrc_unicode_lexbuf : Netulex.ULB.unicode_lexbuf Lazy.t;
}


(* The class type 'resolver' is the official type of all "resolvers".
 * Resolvers take file names (or better, external identifiers) and
 * return lexbufs, scanning the file for tokens. Resolvers may be
 * cloned, and clones can interpret relative file names relative to
 * their creator.
 *
 * Example of the latter:
 *
 * Resolver r reads from file:/dir/f1.xml
 *
 * <tag>some XML text
 * &e;                       -----> Entity e is bound to "subdir/f2.xml"
 * </tag>                           Step (1): let r' = "clone of r"
 *                                  Step (2): open file "subdir/f2.xml"
 *
 * r' must still know the directory of the file r is reading, otherwise
 * it would not be able to resolve "subdir/f2.xml" = "file:/dir/subdir/f2.xml".
 *
 * Actually, this example can be coded as:
 *
 * let r = new resolve_as_file () in
 * let src = r # open_in (System "file:/dir/f1.xml") in
 * ... read from src ...
 * let r' = r # clone in
 * let src' = r' # open_in (System "subdir/f2.xml") in
 * ... read from src' ...
 * r' # close_in;
 * ... read from src ...
 * r # close_in;
 *)

class type resolver =
  object
    (* A resolver can open an input source, and returns this source as
     * lexer_source (the record that gives access to the lexbuf).
     *
     * After creating a resolver, one must invoke the two methods
     * init_rep_encoding and init_warner to set the internal encoding of
     * strings and the warner object, respectively. This is normally
     * done by the parsing functions in Pxp_yacc.
     * It is not necessary to invoke these two methods for a fresh
     * clone.
     *
     * It is possible that the character encoding of the source and the
     * internal encoding of the parser are different. To cope with this,
     * one of the tasks of the resolver is to recode the characters of
     * the input source into the internal character encoding.
     *
     * Note that there are several ways of determining the encoding of the
     * input: (1) It is possible that the transport protocol (e.g. HTTP)
     * transmits the encoding, and (2) it is possible to inspect the beginning
     * of the file, and to analyze:
     * (2.1) The first two bytes indicate whether UTF-16 is used
     * (2.2) Otherwise, one can assume that an ASCII-compatible character
     *       set is used. It is now possible to read the XML declaration
     *       <?xml ... encoding="xyz" ...?>. The encoding found here is
     *       to be used.
     * (2.3) If the XML declaration is missing, the encoding is UTF-8.
     * The resolver needs only to distinguish between cases (1), (2.1),
     * and the rest.
     * The details of analyzing whether (2.2) or (2.3) applies are programmed
     * elsewhere, and the resolver will be told the result (see below).
     *
     * A resolver is like a file: it must be opened before one can work
     * with it, and it should be closed after all operations on it have been
     * done. The method 'open_rid' is called with the resolver ID as argument
     * and it must return the lexer_source reading from the external resource.
     * (There is also the old method 'open_in' that expects an ext_id as
     * argument. It is less powerful and should not be used any longer.)
     * The method 'close_in' does not require an argument.
     *
     * It is allowed to re-open a resolver after it has been closed. It is
     * forbidden to open a resolver again while it is open.
     * It is allowed to close a resolver several times: If 'close_in' is
     * invoked while the resolver is already closed, nothing happens.
     *
     * The method 'open_rid' may raise Not_competent to indicate that this
     * resolver is not able to open this type of IDs.
     *
     * If 'open_rid' gets a PUBLIC ID, it can be assumed that the string
     * is already normalized (whitespace).
     *
     * The method 'change_encoding' is called from the parser after the
     * analysis of case (2) has been done; the argument is either the
     * string name of the encoding, or the empty string to indicate
     * that no XML declaration was found. It is guaranteed that
     * 'change_encoding' is invoked after only a few tokens of the
     * file. The resolver should react as follows:
     * - If case (1) applies:   Ignore the encoding passed to 'change_encoding'.
     * - If case (2.1) applies: The encoding passed to 'change_encoding' must
     *                          be compatible with UTF-16. This should be
     *                          checked, and violations should be reported.
     * - Else:                  If the passed encoding is "", assume UTF-8.
     *                          Otherwise, assume the passed encoding.
     *
     * The following rule helps synchronizing the lexbuf with the encoding:
     * If the resolver has been opened, but 'change_encoding' has not yet
     * been invoked, the lexbuf contains at most one character (which may
     * be represented by multiple bytes); i.e. the lexbuf is created by
     * Lexing.from_function, and the function puts only one character into
     * the buffer at once.
     * After 'change_encoding' has been invoked, there is no longer a limit
     * on the lexbuf size.
     *
     * The reason for this rule is that you know exactly the character where
     * the encoding changes to the encoding passed by 'change_encoding'.
     *
     * The method 'clone' may be invoked for open or closed resolvers.
     * Basically, 'clone' returns a new resolver which is always closed.
     * If the original resolver is closed, the clone is simply a clone.
     * If the original resolver is open at the moment of cloning:
     * If the clone is later opened for a relative system ID (i.e. relative
     * URL), the clone must interpret this ID relative to the ID of the
     * original resolver.
     *)
    method init_rep_encoding : rep_encoding -> unit
    method init_warner : symbolic_warnings option -> collect_warnings -> unit
      (* These two methods set the internal encoding of strings and the
       * warner object, respectively. They must be invoked after a resolver
       * has been created; this is not necessary for a fresh clone.
       *)

    method rep_encoding : rep_encoding
      (* Returns the internal encoding of strings (presumably the value
       * passed to init_rep_encoding - to be confirmed against the
       * implementation).
       *)

    method open_in : ext_id -> lexer_source
      (* This is the old method to open a resolver. It is superseded by
       * open_rid.
       * This method may raise Not_competent if the object does not know
       * how to handle this ext_id.
       *
       * PXP 1.2: Returns now a lexer_source, no longer a lexbuf
       *)

    method open_rid : resolver_id -> lexer_source
      (* This is the new method to open a resolver. It takes a resolver ID
       * instead of an ext_id but works in the same way. It may raise
       * Not_competent if the object is not able to open this kind of ID.
       *
       * PXP 1.2: Returns now a lexer_source, no longer a lexbuf
       *)

    method close_in : unit
      (* Closes the resolver. If the resolver is already closed, nothing
       * happens.
       *)
    method change_encoding : string -> unit
      (* Called by the parser after the beginning of the entity has been
       * analyzed. The argument is the name of the encoding found in the
       * XML declaration, or "" if there is no XML declaration.
       *)


    (* Every resolver can be cloned. The clone does not inherit the connection
     * with the external object, i.e. it is initially closed.
     *)
    method clone : resolver

    method active_id : resolver_id
      (* Returns the actually used resolver ID. This is the ID passed to
       * open_rid where unused components have been set to None. The
       * resolver ID returned by [active_id] plays an important role when
       * expanding relative URLs.
       *)

(*  method close_all : unit *)
      (* Closes this resolver and every clone *)
      (* This method is no longer supported in PXP 1.2 *)

  end
;;


(* The next classes are resolvers for concrete input sources. *)

(* CHANGES IN PXP 1.2:
 *
 * All resolve_read_* classes are now deprecated. The new classes
 * resolve_to_* are based on the Netchannels classes, which serve as a
 * generalization of input streams.
 *
 * Examples: To read from an in_channel, use:
 *
 *   let obj_channel = new Netchannels.input_channel in_channel in
 *   new Pxp_reader.resolve_to_this_obj_channel obj_channel
 *
 * To read from a string, use:
 *
 *   let obj_channel = new Netchannels.input_string string in
 *   new Pxp_reader.resolve_to_this_obj_channel obj_channel
 *
 * Furthermore, the new classes use the resolver_id record as generalized
 * names for entities. This solves most problems with relative URLs.
 *
 * The "Anonymous" ID: In previous versions of PXP, a resolver bound to
 * the Anonymous ID matched the Anonymous ID. This is no longer true.
 * The algebra has been changed such that Anonymous never matches, not 
 * even itself.
 * 
 *   Example: The new resolver
 *     let r = new resolve_to_this_obj_channel ~id:Anonymous ch 
 *   will never accept any ID. In contrast to this, the old, and now
 *   deprecated resolver
 *     let r' = new resolve_read_this_channel ~id:Anonymous ch
 *   accepted the ID Anonymous in previous versions of PXP.
 *
 * The rationale behind this change is that Anonymous acts now like 
 * an "empty set", and not like a concrete element. You can use Private
 * to create as many concrete elements as you want, so there is actually
 * no need for the old behaviour of Anonymous.
 *
 * Note that even the resolver classes provided for backwards compatibility
 * implement this change (to limit the confusion). This means that you
 * might have to change your application to use Private instead of 
 * Anonymous.
 * 
 *)

class resolve_to_this_obj_channel :
  ?id:ext_id ->
  ?rid:resolver_id ->
  ?fixenc:encoding ->
  ?close:(Netchannels.in_obj_channel -> unit) ->
  Netchannels.in_obj_channel ->
  resolver;;

  (* A resolver that reads from exactly the in_obj_channel passed as
   * last argument.
   *
   * ~id, ~rid: If one of these arguments is passed, the created resolver
   * accepts only these IDs (all mentioned private, system, or public
   * IDs). If neither is passed, all IDs are accepted, even Anonymous.
   *
   * The in_obj_channel can only be used once, and so can this resolver:
   * If it is opened a second time (either in the base object or a
   * clone), it will raise Not_competent.
   *
   * ~fixenc: If passed, the encoding of the channel is set to this
   * value, regardless of any auto-recognition or any XML declaration.
   *
   * ~close: This function is called when the resolver is closed. By
   * default, the channel is closed
   * (i.e. the default is: ~close:(fun ch -> ch # close_in())).
   *)

type accepted_id =
    Netchannels.in_obj_channel * encoding option * resolver_id option
  (* When a resolver accepts an ID, this triple specifies how to proceed:
   *
   * - The in_obj_channel is the channel to read data from.
   * - The encoding option may enforce a certain character encoding.
   *   If None is passed, the standard autodetection of the encoding is
   *   performed.
   * - The resolver_id option may detail the ID (this ID will be returned
   *   by active_id). If None is passed, the original ID is taken
   *   unchanged.
   *)

class resolve_to_any_obj_channel :
  ?close:(Netchannels.in_obj_channel -> unit) ->
  channel_of_id:(resolver_id -> accepted_id) ->
  unit ->
  resolver;;

  (* This resolver calls the function channel_of_id to open a new channel for
   * the passed resolver_id. This function must either return the accepted_id,
   * or it must fail with Not_competent.
   *
   * When the resolver is closed, the function passed by the ~close
   * argument is called. By default, the channel is closed
   * (i.e. the default is: ~close:(fun ch -> ch # close_in())).
   *)


class resolve_to_url_obj_channel : 
  ?close:(Netchannels.in_obj_channel -> unit) ->
  url_of_id:(resolver_id -> Neturl.url) ->
  base_url_of_id:(resolver_id -> Neturl.url) ->
  channel_of_url:(resolver_id -> Neturl.url -> accepted_id) ->
  unit ->
    resolver;;

  (*
   * When this resolver gets an ID to read from, it calls the function
   * ~url_of_id to get the corresponding URL (such IDs are normally
   * system IDs, but as ~url_of_id gets the whole resolver_id, it is
   * presumably also possible to map other kinds of IDs to URLs).
   * This URL may be a relative URL; however, a URL scheme must be used
   * which contains a path. The resolver converts the URL to an absolute
   * URL if necessary.
   *
   * To do so, the resolver calls ~base_url_of_id to get the URL the relative
   * URL must be interpreted relative to. Usually, this function returns
   * the rid_system_base as URL. This URL must be absolute.
   *
   * The third function, ~channel_of_url, is fed with the absolute URL
   * as input. This function opens the resource to read from, and returns
   * the accepted_id like resolve_to_any_obj_channel does. The resolver ID
   * passed to ~channel_of_url contains the string representation of the
   * absolute URL as system ID.
   *
   * Both functions, ~url_of_id and ~channel_of_url, can raise
   * Not_competent to indicate that the object is not able to read from
   * the specified resource. However, there is a difference: A Not_competent
   * from ~url_of_id is left as it is, but a Not_competent from ~channel_of_url
   * is converted to Not_resolvable. So only ~url_of_id decides which URLs
   * are accepted by the resolver and which not, and in the latter case,
   * other resolvers can be tried. If ~channel_of_url raises Not_competent,
   * the whole resolution procedure will stop, and no other resolver will
   * be tried.
   *
   * When the resolver is closed, the function passed by the ~close
   * argument is called. By default, the channel is closed
   * (i.e. the default is: ~close:(fun ch -> ch # close_in())).
   *)


class resolve_as_file :
  ?file_prefix:[ `Not_recognized | `Allowed | `Required ] ->
  ?host_prefix:[ `Not_recognized | `Allowed | `Required ] ->
  ?system_encoding:encoding ->
  ?map_private_id:  (private_id -> Neturl.url) ->
  ?open_private_id: (private_id -> in_channel * encoding option) ->
  ?base_url_defaults_to_cwd: bool ->
  ?not_resolvable_if_not_found:bool ->
  unit ->
  resolver;;

  (* Reads from the local file system. Every file name is interpreted as
   * file name of the local file system, and the referred file is read.
   *
   * The full form of a file URL is: file://host/path, where
   * 'host' specifies the host system where the file identified by 'path'
   * resides. host = "" or host = "localhost" are accepted; other values
   * will raise Not_competent. The standard for file URLs is
   * defined in RFC 1738.
   *
   * Option ~file_prefix: Specifies how the "file:" prefix of file names
   * is handled:
   * `Not_recognized:  The prefix is not recognized.
   * `Allowed:         The prefix is allowed but not required (the default).
   * `Required:        The prefix is required.
   *
   * Option ~host_prefix: Specifies how the "//host" phrase of file names
   * is handled:
   * `Not_recognized:  The phrase is not recognized.
   * `Allowed:         The phrase is allowed but not required (the default).
   * `Required:        The phrase is required.
   *
   * Option ~system_encoding: Specifies the encoding of file names of
   * the local file system. Default: UTF-8.
   *
   * Options ~map_private_id and ~open_private_id: THESE OPTIONS ARE
   * DEPRECATED! IT IS NOW POSSIBLE TO USE A COMBINED RESOLVER TO ACHIEVE
   * THE SAME EFFECT! - These must always be
   * used together. They specify an exceptional behaviour in case a private
   * ID is to be opened. map_private_id maps the private ID to a URL
   * (or raises Not_competent). However, instead of opening the URL
   * the function open_private_id is called to get an in_channel to read
   * from and to get the character encoding. The URL is taken into account
   * when subsequently relative SYSTEM IDs must be resolved.
   *
   * Option ~base_url_defaults_to_cwd: If true, relative URLs
   * are interpreted relative to the current working directory at the time
   * the class is instantiated, but only if there is no parent URL, i.e.
   * rid_system_base=None. If false (the default), such URLs cannot be resolved.
   * In general, it is better to set this option to false, and to
   * initialize rid_system_base properly.
   *
   * Option ~not_resolvable_if_not_found: If true (the default),
   * "File not found" errors stop the resolution process. If false,
   * "File not found" is treated as [Not_competent], i.e. other resolvers
   * may still be tried.
   *)

val make_file_url :
  ?system_encoding:encoding ->
  ?enc:encoding ->
  string ->
    Neturl.url;;

(* This is a convenience function to create a file URL (for localhost).
 * The argument is the file name encoded in the character set ~enc.
 * Relative file names are automatically converted to absolute names
 * by prepending Sys.getcwd() to the passed file name.
 *
 * ~system_encoding: Specifies the encoding of file names of
 *     the local file system. Default: UTF-8. (This argument is
 *     necessary to interpret Sys.getcwd() correctly.)
 * ~enc: The encoding of the passed string. Defaults to `Enc_utf8
 *
 * Note: To get a string representation of the URL, apply
 * Neturl.string_of_url to the result, e.g.
 *   System (Neturl.string_of_url (make_file_url "f.xml"))
 *)



(* The following classes and functions create resolvers for catalogs
 * of PUBLIC or SYSTEM identifiers.
 *)

class lookup_id :
  (ext_id * resolver) list ->    (* catalog *)
    resolver;;
  (* The general catalog class. The catalog argument specifies pairs (xid,r)
   * mapping external IDs xid to subresolvers r. The subresolver r is invoked
   * if an entity with the corresponding xid is to be opened.
   *
   * Note: SYSTEM IDs are simply compared literally, without making
   * relative IDs absolute. See norm_system_id below for this function.
   *)


class lookup_id_as_file :
  ?fixenc:encoding ->
  (ext_id * string) list ->      (* catalog *)
    resolver;;

  (* The catalog argument specifies pairs (xid,file) mapping external IDs xid
   * to files. The file is read if an entity with the corresponding xid is
   * to be opened.
   *
   * Note: SYSTEM IDs are simply compared literally, without making
   * relative IDs absolute. See norm_system_id below for this function.
   *
   * ~fixenc: Overrides the encoding of the file contents. By default, the
   *     standard rule is applied to find out the encoding of the file.
   *)


class lookup_id_as_string :
  ?fixenc:encoding ->
  (ext_id * string) list ->      (* catalog *)
    resolver;;

  (* The catalog argument specifies pairs (xid,s) mapping external IDs xid
   * to strings s. The string is read if an entity with the corresponding
   * xid is to be opened.
   *
   * Note: SYSTEM IDs are simply compared literally, without making
   * relative IDs absolute. See norm_system_id below for this function.
   *
   * ~fixenc: Presumably overrides the encoding of the strings, as it does
   *     for the other lookup_*_as_string classes (to be confirmed).
   *)


class lookup_public_id :
  (string * resolver) list ->    (* catalog *)
    resolver;;

  (* This is the generic builder for PUBLIC id catalog resolvers: The catalog
   * argument specifies pairs (pubid, r) mapping PUBLIC identifiers to
   * subresolvers.
   *
   * The subresolver r is invoked if an entity with the corresponding PUBLIC
   * id is to be opened.
   *)



class lookup_public_id_as_file :
  ?fixenc:encoding ->
  (string * string) list ->     (* catalog *)
    resolver;;

  (* Makes a resolver for PUBLIC identifiers. The catalog argument specifies
   * pairs (pubid, filename) mapping PUBLIC identifiers to filenames. The
   * filenames must already be encoded in the character set the system uses
   * for filenames.
   *
   * Note: This class does not enable the resolution of inner IDs of PUBLIC
   * entities by relative SYSTEM names. To get this effect, use
   * the class lookup_id, and feed it with combined
   * Public(pubid,sysid) identifiers. In this case, the entity has both
   * a PUBLIC and a SYSTEM ID, and resolution of inner relative SYSTEM
   * names works.
   *
   * ~fixenc: Overrides the encoding of the file contents. By default, the
   *     standard rule is applied to find out the encoding of the file.
   *)


class lookup_public_id_as_string :
  ?fixenc:encoding ->
  (string * string) list ->    (* catalog *)
    resolver;;

  (* Makes a resolver for PUBLIC identifiers. The catalog argument specifies
   * pairs (pubid, text) mapping PUBLIC identifiers to XML text (which must
   * begin with <?xml ...?>). The text is read if an entity with the
   * corresponding PUBLIC id is to be opened.
   *
   * ~fixenc: Overrides the encoding of the strings.
   *)


class lookup_system_id :
  (string * resolver) list ->    (* catalog *)
    resolver;;

  (* This is the generic builder for SYSTEM id catalog resolvers: The catalog
   * argument specifies pairs (sysid, r) mapping SYSTEM identifiers to
   * subresolvers.
   * The subresolver r is invoked if an entity with the corresponding SYSTEM
   * id is to be opened.
   *
   * Important note: Two SYSTEM IDs are considered as equal if they are
   * equal in their string representation. (This may not be what you want
   * and may cause trouble... However, I currently do not know how to
   * implement a "semantical" comparison logic.)
   *
   * Note: SYSTEM IDs are simply compared literally, without making
   * relative IDs absolute. See norm_system_id below for this function.
   *)


class lookup_system_id_as_file :
  ?fixenc:encoding ->
  (string * string) list ->     (* catalog *)
    resolver;;

  (* Makes a resolver for SYSTEM identifiers: The catalog argument specifies
   * pairs (sysid, filename) mapping SYSTEM identifiers to filenames. The
   * filenames must already be encoded in the character set the system uses
   * for filenames.
   *
   * Note: SYSTEM IDs are simply compared literally, without making
   * relative IDs absolute. See norm_system_id below for this function.
   *
   * ~fixenc: Overrides the encoding of the file contents. By default, the
   *     standard rule is applied to find out the encoding of the file.
   *)


class lookup_system_id_as_string :
  ?fixenc:encoding ->
  (string * string) list ->     (* catalog *)
    resolver;;

  (* Makes a resolver for SYSTEM identifiers: The catalog argument specifies
   * pairs (sysid, text) mapping SYSTEM identifiers to XML text (which must
   * begin with <?xml ...?>).
   *
   * Note: SYSTEM IDs are simply compared literally, without making
   * relative IDs absolute. See norm_system_id below for this function.
   *
   * ~fixenc: Overrides the encoding of the strings.
   *)


class norm_system_id : resolver -> resolver
  (* Normalizes the SYSTEM ID, and forwards the open request to the
   * passed resolver.
   *
   * Normalization includes:
   * - Relative URLs are made absolute. If this fails, the problematic
   *   relative URL will be rejected.
   * - .. and . and // in the middle of URLs are removed
   * - Escaping of reserved characters is normalized
   *
   * Normalization is recommended for catalogs, e.g.
   * new norm_system_id
   *   (new lookup_system_id_as_file
   *      [ "http://h/p1", ...;
   *        "http://h/p2", ...;
   *      ])
   * First, the catalog now even works if the URL is written in an
   * unusual way, e.g. http://h/p1/../p2, or http://h/p%31.
   * Second, relative URLs can be used. For instance, the document
   * referred to as http://h/p1 can now refer to the other document
   * as p2.
   *)


class rewrite_system_id :
  ?forward_unmatching_urls:bool ->
  (string * string) list ->
  resolver ->
  resolver
  (* Rewrites the SYSTEM URL according to the list of pairs, and passes
   * the rewritten URL on to the resolver given as last argument. In
   * every pair, the left component is the pattern, and the right
   * component is the substitute. For example,
   *
   * new rewrite_system_id
   *       [ "http://host/foo/", "file:///dir/" ]
   *       r
   *
   * rewrites all URLs beginning with http://host/foo/ to file:///dir/,
   * e.g. http://host/foo/x becomes file:///dir/x.
   *
   * Matching rules:
   * - If the pattern ends with a slash (as in the example), a prefix
   *   match is performed, i.e. the whole directory hierarchy is rewritten.
   * - If the pattern does not end with a slash, an exact match is
   *   performed, i.e. only a single URL is rewritten.
   *
   * Before the match is tried, the class normalizes URLs as
   * norm_system_id does.
   *
   * By default, URLs that do not match any pattern are rejected
   * (Not_competent).
   *
   * The rewritten URL is only visible within the passed subresolver.
   * If the opened entity accesses other entities by relative URLs,
   * these will be resolved relative to the original URL as it was before
   * rewriting it. This gives some protection against unwanted accesses.
   * For example, if you map http://host/contents to file:///data/contents,
   * it will not be possible to access files outside this directory,
   * even if tricks are used like opening ../../etc/passwd relative to
   * http://host/contents.  Of course, this protection works only if
   * the resolver opening the file is a subresolver of rewrite_system_id.
   *
   * CHECK: Does this really work?
   *
   * Another application of this class is to use the identity as rewriting
   * rule. This resolver
   *
   * new rewrite_system_id
   *       [ "file:///data/", "file:///data/" ]
   *       ( new resolve_as_file() )
   *
   * has the effect that only files under /data can be accessed, and
   * other files such as /etc/passwd cannot.
   *
   * Option ~forward_unmatching_urls: If true, URLs that do not match any
   *   pattern are forwarded to the inner resolver. These URLs are not
   *   rewritten. NOTE THAT THE MENTIONED ACCESS RESTRICTIONS USUALLY DO
   *   NOT WORK ANYMORE IF THIS OPTION IS TURNED ON.
   *)


(* Selects the order in which class 'combine' tries the names of an
 * entity that has both a public and a system identifier.
 *)
type combination_mode =
  | Public_before_system    (* Try public identifiers first *)
  | System_before_public    (* Try system identifiers first *)
;;


class combine :
  ?mode:combination_mode ->
  resolver list ->
  resolver;;

  (* Builds one resolver out of several resolver objects. When a concrete
   * entity with an ext_id is to be opened, the contained resolvers are
   * tried in turn, until one of them accepts opening the entity
   * (i.e. it does not raise Not_competent on open_rid).
   *
   * An entity may have several names, e.g. a public name and a system
   * name. By default, these names are tried in parallel (this is
   * possible in the PXP 1.2 model). For backward compatibility, the
   * ~mode argument allows one to specify a different order:
   *
   * (1) Public_before_system: Try first to open as public identifier,
   *     and if that fails, fall back to the system identifier
   * (2) System_before_public: Try first to open as system identifier,
   *     and if that fails, fall back to the public identifier
   *
   * Clones: The effect of the 'clone' method depends on the state:
   * - Invoked before 'open_rid': all contained resolvers are cloned and
   *   again combined.
   * - Invoked after 'open_rid' (i.e. while the resolver is open): only
   *   the active resolver is cloned.
   *)


(* ====================================================================== *)

(* TODO: The following examples recommend deprecated classes. *)

(* EXAMPLES OF RESOLVERS:
 *
 * let r1 = new resolve_as_file ()
 *   - r1 can open all local files
 *
 * let r2 = new resolve_read_this_channel
 *            ~id:(System "file:/dir/f.xml")
 *            (open_in "/dir/f.xml")
 *   - r2 can only read /dir/f.xml of the local file system. If this file
 *     contains references to other files, r2 will fail.
 *     Note that the channel is automatically closed after XML parsing
 *     is done.
 *
 * let r3 = new combine [ r2; r1 ]
 *   - r3 reads /dir/f.xml of the local file system by calling r2, and all
 *     other files by calling r1. However, inner references within 
 *     /dir/f.xml still fail.
 *
 * let pid = Pxp_types.allocate_private_id() in
 * let r4 = new resolve_read_this_channel 
 *                ~id:(Private pid) 
 *                (open_in "/dir/f.xml")
 *   - r4 can only read from a so-called private ID. These are opaque
 *     identifiers that can be mapped to channels and files as needed.
 *     They do not have a textual representation, and they cannot be
 *     referred to from XML text.
 *
 * ----------------------------------------------------------------------
 * 
 * Now a bigger example. The task is to:
 *  - resolve the PUBLIC IDs P and Q to some files;
 *  - resolve the SYSTEM ID "http://r/s.dtd" to another file;
 *  - resolve all file SYSTEM IDs
 *  - start parsing with "f.xml" in the current directory
 *
 * let r =
 *   new combine 
 *     [ new lookup_public_id_as_file 
 *         [ "P", "file_for_p";   "Q", "file_for_q" ];
 *       new lookup_system_id_as_file
 *         [ "http://r/s.dtd", "file_for_this_dtd" ];
 *       new resolve_as_file()
 *     ]
 * in
 * (* The recommended way to create the start_id from file names: *)
 * let start_url =
 *   make_file_url "f.xml" in
 * let start_id = 
 *   System (Neturl.string_of_url start_url) in
 * let source = ExtID(start_id, r) in
 * parse_document_entity ... source ...
 *
 * ----------------------------------------------------------------------
 *
 * A variation:
 *
 *  - resolve the PUBLIC IDs P and Q to some files;
 *  - resolve the SYSTEM ID "http://r/s.dtd" to another file;
 *  - do not resolve any file URL
 *  - start parsing with "f.xml" in the current directory
 *
 * let pid = allocate_private_id() in
 * let start_id = Private pid in
 * let r =
 *   new combine 
 *     [ new lookup_public_id_as_file 
 *         [ "P", "file_for_p";   "Q", "file_for_q" ];
 *       new lookup_system_id_as_file
 *         [ "http://r/s.dtd", "file_for_this_dtd" ];
 *       new resolve_read_any_channel
 *         ~channel_of_id: (fun xid ->
 *            if xid = start_id then 
 *              open_in_bin "f.xml", None  (* you may want to catch Sys_error *)
 *            else raise Not_competent)
 *         ();
 *     ]
 * in
 * let source = ExtID(start_id, r) in
 * parse_document_entity ... source ...
 *
 * ----------------------------------------------------------------------
 *
 * Three further examples can be found in the source of Pxp_yacc (file
 * pxp_yacc.m2y): the implementations of from_file, from_channel, and
 * from_string are also applications of the Pxp_reader objects.
 *)

(**********************************************************************)
(* DEPRECATED CLASSES                                                 *)
(**********************************************************************)

class resolve_read_this_channel :
  ?id:ext_id -> ?fixenc:encoding -> ?close:(in_channel -> unit) ->
  in_channel -> resolver;;

  (* THIS CLASS IS DEPRECATED! USE resolve_to_this_obj_channel INSTEAD!
   *)

  (* Reads from the passed channel (it may be even a pipe). If the ~id
   * argument is passed to the object, the created resolver accepts only
   * this ID (except Anonymous). Otherwise all IDs are accepted, even
   * Anonymous.
   * Once the resolver has been cloned, it does not accept any ID. This
   * means that this resolver cannot handle inner references to external
   * entities. Note that you can combine this resolver with another resolver
   * that can handle inner references (such as resolve_as_file); see
   * class 'combine' above.
   * If you pass the ~fixenc argument, the encoding of the channel is
   * set to the passed value, regardless of any auto-recognition or
   * any XML declaration.
   * When the resolver is closed, the function passed by the ~close
   * argument is called. By default, the channel is closed
   * (i.e. the default is: ~close:close_in).
   *)


class resolve_read_any_channel :
  ?close:(in_channel -> unit) ->
  channel_of_id:(ext_id -> (in_channel * encoding option)) ->
  unit ->
  resolver;;

  (* THIS CLASS IS DEPRECATED! USE resolve_to_any_obj_channel INSTEAD!
   *
   * Note: The function channel_of_id may be called several times to find
   * out the right ext_id from the current resolver_id. The first result
   * that is not Not_competent is taken.
   *)

  (* new resolve_read_any_channel ~channel_of_id:f_open ():
   *
   * This resolver calls the function f_open to open a new channel for
   * the passed ext_id. This function must either return the channel and
   * the encoding, or it must fail with Not_competent.
   *
   * The function must return None as encoding if the default mechanism to
   * recognize the encoding should be used. It must return Some e if it is
   * already known that the encoding of the channel is e.
   *
   * When the resolver is closed, the function passed by the ~close
   * argument is called. By default, the channel is closed
   * (i.e. the default is: ~close:close_in).
   *)

class resolve_read_url_channel :
  ?base_url:Neturl.url ->
  ?close:(in_channel -> unit) ->
  url_of_id:(ext_id -> Neturl.url) ->
  channel_of_url:(ext_id -> Neturl.url -> (in_channel * encoding option)) ->
  unit ->
    resolver;;

  (* THIS CLASS IS DEPRECATED! USE resolve_to_url_obj_channel INSTEAD!
   *
   * Note: The function url_of_id may be called several times to find
   * out the right ext_id from the current resolver_id. The first result
   * that is not Not_competent is taken.
   *
   * Note: The optional argument base_url is ignored. The class always uses
   * the rid_system_base string to interpret relative URLs.
   *)

  (* new resolve_read_url_channel ~url_of_id ~channel_of_url ():
   *
   * When this resolver gets an ID to read from, it calls the function
   * ~url_of_id to get the corresponding URL. This URL may be a relative
   * URL; however, a URL scheme must be used which contains a path.
   * The resolver converts the URL to an absolute URL if necessary.
   * The second function, ~channel_of_url, is fed with the absolute URL
   * as input. This function opens the resource to read from, and returns
   * the channel and the encoding of the resource.
   * NOTE(review): ~channel_of_url also takes an ext_id as its first
   * argument; presumably this is the ID being resolved - confirm against
   * the implementation.
   *
   * Both functions, ~url_of_id and ~channel_of_url, can raise
   * Not_competent to indicate that the object is not able to read from
   * the specified resource. However, there is a difference: A Not_competent
   * from ~url_of_id is left as it is, but a Not_competent from ~channel_of_url
   * is converted to Not_resolvable. So only ~url_of_id decides which URLs
   * are accepted by the resolver and which are not.
   *
   * The function ~channel_of_url must return None as encoding if the default
   * mechanism to recognize the encoding should be used. It must return
   * Some e if it is already known that the encoding of the channel is e.
   *
   * When the resolver is closed, the function passed by the ~close
   * argument is called. By default, the channel is closed
   * (i.e. the default is: ~close:close_in).
   *
   * [Does not apply to the current implementation but to former ones:]
   * Objects of this class contain a base URL relative to which relative
   * URLs are interpreted. When creating a new object, you can specify
   * the base URL by passing it as ~base_url argument. When an existing
   * object is cloned, the base URL of the clone is the URL of the original
   * object.
   *
   * Note that the term "base URL" has a strict definition in RFC 1808.
   *)


class resolve_read_this_string :
  ?id:ext_id -> ?fixenc:encoding -> string -> resolver;;

  (* THIS CLASS IS DEPRECATED! USE resolve_to_this_obj_channel INSTEAD!
   *)

  (* new resolve_read_this_string ?id ?fixenc s:
   *
   * Reads from the passed string s. If the ~id
   * argument is passed to the object, the created resolver accepts only
   * this ID (except Anonymous). Otherwise all IDs are accepted, even
   * Anonymous.
   *
   * Once the resolver has been cloned, it does not accept any ID. This
   * means that this resolver cannot handle inner references to external
   * entities. Note that you can combine this resolver with another resolver
   * that can handle inner references (such as resolve_as_file); see
   * class 'combine'.
   *
   * If you pass the ~fixenc argument, the encoding of the string is
   * set to the passed value, regardless of any auto-recognition or
   * any XML declaration.
   *)


class resolve_read_any_string :
  string_of_id:(ext_id -> (string * encoding option)) -> unit -> resolver;;

  (* THIS CLASS IS DEPRECATED! USE resolve_to_any_obj_channel INSTEAD!
   *)

  (* new resolve_read_any_string ~string_of_id:f_open ():
   *
   * This resolver calls the function f_open to get the string for
   * the passed ext_id. This function must either return the string and
   * the encoding, or it must fail with Not_competent.
   *
   * The function must return None as encoding if the default mechanism to
   * recognize the encoding should be used. It must return Some e if it is
   * already known that the encoding of the string is e.
   *)

val lookup_public_id_as_file :
  ?fixenc:encoding ->
  (string * string) list ->     (* catalog *)
    resolver;;
  (* Same as the equally named class, but provided as a plain function:
   * it takes the catalog (a list of string pairs) and returns the
   * resolver directly.
   * NOTE(review): judging by the name, each catalog pair maps a PUBLIC
   * identifier to a file name, and ~fixenc presumably forces the encoding
   * as it does for the other resolvers - confirm against the class.
   *)

val lookup_public_id_as_string :
  ?fixenc:encoding ->
  (string * string) list ->     (* catalog *)
    resolver;;
  (* Same as the equally named class, but provided as a plain function:
   * it takes the catalog (a list of string pairs) and returns the
   * resolver directly.
   * NOTE(review): judging by the name, each catalog pair maps a PUBLIC
   * identifier to the string to read from, and ~fixenc presumably forces
   * the encoding as it does for the other resolvers - confirm against
   * the class.
   *)

val lookup_system_id_as_file :
  ?fixenc:encoding ->
  (string * string) list ->     (* catalog *)
    resolver;;
  (* Same as the equally named class, but provided as a plain function:
   * it takes the catalog (a list of string pairs) and returns the
   * resolver directly.
   * NOTE(review): judging by the name, each catalog pair maps a SYSTEM
   * identifier to a file name, and ~fixenc presumably forces the encoding
   * as it does for the other resolvers - confirm against the class.
   *)

val lookup_system_id_as_string :
  ?fixenc:encoding ->
  (string * string) list ->     (* catalog *)
    resolver;;
  (* Same as the equally named class, but provided as a plain function:
   * it takes the catalog (a list of string pairs) and returns the
   * resolver directly.
   * NOTE(review): judging by the name, each catalog pair maps a SYSTEM
   * identifier to the string to read from, and ~fixenc presumably forces
   * the encoding as it does for the other resolvers - confirm against
   * the class.
   *)


(* This web site is published by Informatikbüro Gerd Stolpmann
 * Powered by Caml
 *)