Plasma GitLab Archive
Projects Blog Knowledge

(* $Id: neturl.mli 1565 2011-03-21 16:30:34Z ChriS $
 * ----------------------------------------------------------------------
 *
 *)

(* This module already uses O'Caml-3 features. *)

(** Uniform Resource Locators (URLs)
 *
 * {b Contents}
 *
 * - {!Neturl.interface}
 *
 * The tutorial has been moved to {!Neturl_tut}.
 *)

(** {1:interface Interface}
 *
 * This module provides functions to parse URLs, to print URLs, to
 * store URLs, to modify URLs, and to apply relative URLs.
 *
 * URLs are strings formed according to pattern (1) or (2):
 *
 * + [scheme://user;userparams:password\@host:port/path;params?query#fragment]
 * + [scheme:other;params?query#fragment]
 *
 * The word at the beginning of the URL identifies the URL scheme
 * (such as "http" or "file"). Depending on the scheme, not all of the
 * parts are allowed, or parts may be omitted. This module defines the
 * type [url_syntax] whose values describe which parts are allowed/required/
 * not allowed for a concrete URL scheme (see below).
 *
 * Not all characters are allowed in a URL. Some characters are allowed,
 * but have the special task to separate the various parts of the URL
 * (reserved characters).
 * However, it is possible to include even invalid or reserved characters
 * as normal content by applying the [%]-encoding on these characters:
 * A ['%'] indicates that an encoded character follows, and the character
 * is denoted by a two-digit hexadecimal number (e.g. [%2f] for ['/']).
 * In the following descriptions, the term "encoded string" means a string
 * containing such [%]-encoded characters, and the "decoded string" means a
 * string not containing such characters.
 * See the module {!Netencoding.Url} for functions encoding or decoding
 * strings.
 *
 * The type [url] describes values storing the components of a URL,
 * and the [url_syntax] for the URL. In general, the components are
 * stored as encoded strings; however, the [%]-encoding is not
 * applicable to all components.
 *
 * For convenience, the functions creating, modifying, and accessing
 * URLs can handle both encoded and decoded strings. In order to
 * avoid errors, the functions accept and return strings in their
 * decoded form by default.
 *
 * Note that there is currently no function to compare URLs. The
 * canonical comparison ( [=] ) is not applicable because the same URL
 * may be written in different ways.
 *
 * Note that nothing is said about the character set/encoding of URLs.
 * Some protocols and standards prefer UTF-8 as fundamental encoding
 * and apply the [%]-encoding on top of it; i.e. the byte sequence
 * representing a character in UTF-8 is [%]-encoded.
 *
 * {b Standards Compliance}
 *
 * This module implements RFC 1738 and RFC 1808. There is also a newer
 * RFC, 2396, updating the former RFCs, but this module is not fully
 * compatible with RFC 2396. The following (minor) problems may occur:
 *
 * - The module escapes more characters than needed. All characters that
 *   are "unsafe" or "reserved" in either RFC document are escaped.
 * - URL parameters (appended with a ";") are handled as in RFCs 1738/1808.
 *   In RFC 2396, every path component may have parameters, and the
 *   algorithm to resolve relative URLs is different in this point.
 *   If it is required to apply RFC 2396, one can disable URL parameters
 *   in the syntax, and extract them from the path by a self-written
 *   postprocessor. Usually, this is only required for [imap] URLs.
 *
 * In one point, RFC 2396 is preferred:
 *
 * - Authorities may be terminated by a question mark, as in
 *   ["http://host?query"]. This is illegal in RFC 1738. The consequence
 *   is, however, that question marks in user strings must be escaped.
 *)

exception Malformed_URL
(** Raised by a number of functions when encountering a badly formed
 * URL, e.g. by [extract_url_scheme], [parse_url], [apply_relative_url],
 * and [ensure_absolute_url].
 *)

val extract_url_scheme : string -> string
  (** Returns the URL scheme from the string representation of a URL.
   * E.g. [extract_url_scheme "http://host/path" = "http"].
   * The scheme name is always converted to lowercase characters.
   *
   * @raise Malformed_URL if the scheme name is not found.
   *)

(** Describes for a single part of a URL whether a [url_syntax]
 * recognizes it, allows it (optionally), or requires it.
 *)
type url_syntax_option =
    Url_part_not_recognized  (** The part is not recognized, even if it is present *)
  | Url_part_allowed         (** The part can be present *)
  | Url_part_required        (** The part must be present *)


type url_syntax =
    { url_enable_scheme    : url_syntax_option;
      url_enable_user      : url_syntax_option;
      url_enable_user_param: url_syntax_option;
      url_enable_password  : url_syntax_option;
      url_enable_host      : url_syntax_option;
      url_enable_port      : url_syntax_option;
      url_enable_path      : url_syntax_option;
      url_enable_param     : url_syntax_option;
      url_enable_query     : url_syntax_option;
      url_enable_fragment  : url_syntax_option;
      url_enable_other     : url_syntax_option;
      url_accepts_8bits    : bool;
      url_is_valid         : url -> bool;
      url_enable_relative  : bool;
    }
(** Values of type [url_syntax] describe which components of a URL are
 * recognized, which are allowed (and optional), and which are required.
 * Not all combinations are valid; the predicate expressed by the
 * function [url_syntax_is_valid] must hold.
 *
 * The function [url_is_valid] is applied when a fresh URL is created
 * and must return [true]. This function makes it possible to add an
 * arbitrary validity criterion to [url_syntax]. (Note that the URL
 * passed to this function is not yet fully functional; however, you can
 * safely assume that the accessor functions [url_scheme] etc. can be
 * applied to it.)
 *
 * Switch [url_accepts_8bits]: If [true], the bytes with code 128 to
 * 255 are treated like alphanumeric characters; if [false] these bytes
 * are illegal (but it is still possible to include such bytes in their
 * encoded form: [%80] to [%FF]).
 *
 * Switch [url_enable_relative]: If [true], the syntax allows relative
 * URLs in principle. Actually, parsing of relative URLs is possible
 * when the optional parts are flagged as [Url_part_allowed] and not
 * as [Url_part_required]. However, it is useful to specify URL syntaxes
 * always as absolute URLs, and to weaken them on demand when a relative
 * URL is found by the parser. This switch enables that. In particular,
 * the function [partial_url_syntax] checks this flag.
 *)

and url
 (** Values of type [url] describe concrete URLs. Every URL must have
 * a fundamental [url_syntax], and it is only possible to create URLs
 * conforming to the syntax. See [make_url] for further information.
 *)
;;



val url_syntax_is_valid : url_syntax -> bool
  (** Checks whether the passed [url_syntax] is valid. This means:
   * - If passwords are recognized, users (and hosts) must be recognized, too
   * - If ports are recognized, hosts must be recognized, too
   * - If users are recognized, hosts must be recognized, too
   * - Either the syntax recognizes one of the phrases
   *   \{ user, password, host, port, path \}, or the syntax recognizes
   *   the phrase 'other'.
   *)


val partial_url_syntax : url_syntax -> url_syntax
  (** Transforms the syntax into another syntax where all required parts are
   * changed into optional parts. This is the weakening needed to parse
   * partial/relative URLs.
   *
   * According to the description of the [url_enable_relative] switch of
   * [url_syntax], this function checks that flag.
   * NOTE(review): what exactly happens when the flag is [false] is not
   * specified in this interface - confirm against the implementation.
   *)


(* Note that none of the following url_syntaxes allows 8-bit bytes. *)

val null_url_syntax   : url_syntax
  (** A URL syntax that recognizes nothing. Use this as the base for your
   * own definitions, e.g.
   * {[
   * let my_syntax = { null_url_syntax with
   *                     url_enable_host = Url_part_required; ... }
   * ]}
   *)

val ip_url_syntax : url_syntax
  (** Syntax for IP-based protocols. This syntax allows scheme, user,
   * password, host, port, path, param, query, fragment, but not "other".
   * It does not accept 8-bit bytes.
   *)

val common_url_syntax : (string, url_syntax) Hashtbl.t
  (** Syntax descriptions for common URL schemes. The key of the hashtable
   * is the scheme name, and the value is the corresponding syntax.
   * In the following list, a trailing [?] marks a part as optional.
   *
   * - ["file"]: scheme, host?, path
   * - ["ftp"]: scheme, user?, password?, host, port?, path?, param?
   *   Note: param is not checked.
   * - ["http"], ["https"]:
   *   scheme, user?, password?, host, port?, path?, query?
   * - ["mailto"]: scheme, other, query? (RFC 2368)
   * - ["pop"], ["pops"]: scheme, user?, user_param?, password?, host, port?
   *   Note: user_param is not checked.
   *   (RFC 2384)
   * - ["imap"], ["imaps"]: scheme, user?, user_param?, password?, host, port?,
   *   path?, query? (RFC 2192)
   *   Note: "param" is intentionally not recognized to get the resolution of
   *   relative URLs as described in the RFC. When analysing this kind of URL,
   *   it is recommended to re-parse it with "param" enabled.
   * - ["news"]: scheme, other (RFC 1738)
   * - ["nntp"], ["nntps"]: scheme, host, port?, path (with two components)
   *   (RFC 1738)
   * - ["data"]: scheme, other (RFC 2397). "other" is not further decomposed.
   * - ["ipp"], ["ipps"]: scheme, host, port? , path?, query? (RFC 3510)
   * - ["cid"], ["mid"]: Content/message identifiers: scheme, other
   *
   * Notes:
   * - These syntax descriptions can be weakened for partial/relative URLs
   *   by changing the required parts to optional parts: See the function
   *   [partial_url_syntax].
   * - None of the descriptions allows fragments. These can be enabled by
   *   setting [url_enable_fragment] to [Url_part_allowed]. E.g.
   *   {[ let file_url_syntax = Hashtbl.find common_url_syntax "file" in
   *      { file_url_syntax with url_enable_fragment = Url_part_allowed } ]}
   * - 8-bit bytes are not accepted
   * - A large number of standardised scheme syntaxes are not available,
   *   e.g. gopher, prospero, wais. The selection is a bit subjective,
   *   but I have tried to omit protocols that are no longer in common
   *   use, or that are very special.
   * - The LDAP URL syntax (RFC 1959) does not fit into our scheme, it
   *   is omitted for now because of this.
   *)

val null_url : url
  (** A URL that has no components and uses [null_url_syntax] as its
   * syntax.
   *)

val make_url :
      ?encoded:bool ->
      ?scheme:string ->
      ?user:string ->
      ?user_param:string list ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url_syntax ->
      url
  (** Creates a URL from components:
   *
   * - The components [scheme] and [host] are simple strings to which the
   *   [%]-encoding is not applicable.
   * - The component [port] is a simple number. Of course, the [%]-encoding
   *   is not applicable either.
   * - The components [user], [password], [query], [fragment], and [other]
   *   are strings which may contain [%]-encoded characters. By default,
   *   you can pass any string for these components, and problematic characters
   *   are automatically encoded. If you set [encoded:true], the passed
   *   strings must already be encoded, but the function checks whether
   *   the encoding is syntactically correct.
   *   Note that for [query] even the characters ['?'] and ['='] are encoded
   *   by default, so you need to set [encoded:true] to pass a reasonable
   *   query string.
   * - The components [user_param], [path] and [param] are lists of strings which may
   *   contain [%]-encoded characters. Again, the default is to pass
   *   decoded strings to the function, and the function encodes them
   *   automatically, and by setting [encoded:true] the caller is responsible
   *   for encoding the strings. Passing empty lists for these components
   *   means that they are not part of the constructed URL.
   *   See below for the representation of these components.
   *
   * The strings representing the components do not
   * contain the characters separating the components from each other.
   *
   * The created URL must conform to the [url_syntax], i.e.:
   * - The URL must only contain components which are recognized by the
   *   syntax
   * - The URL must contain components which are required by the syntax
   * - The URL must fulfill the predicate expressed by the [url_is_valid]
   *   function of the syntax.
   *
   * The path of a URL is represented as a list of ['/']-separated path
   * components, i.e.
   *
   * [ [ s1; s2; ...; sN ] ]  represents the path
   *                        [s1 ^ "/" ^ s2 ^ "/" ^ ... ^ "/" ^ sN]
   *
   * As special cases:
   * -  [[]]                   is the non-existing path
   * -  [[ "" ]]               is ["/"]
   * -  [[ "";"" ]]            is illegal
   *
   * Except for [s1] and [sN], the path components must not be empty strings.
   *
   * To avoid ambiguities, it is illegal to create URLs with both relative
   * paths ([s1 <> ""]) and host components.
   *
   * Parameters of URLs ([param] and [user_param]) are components
   * beginning with [';']. The list
   * of parameters is represented as a list of strings where the strings
   * contain the value following [';'].
   *)

val modify_url :
      ?syntax:url_syntax ->
      ?encoded:bool ->
      ?scheme:string ->
      ?user:string ->
      ?user_param:string list ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url ->
      url
  (** Modifies the passed components and returns the modified URL.
   * If [syntax] is passed, the returned URL uses this [url_syntax]
   * instead of the syntax of the original URL (this can be used, for
   * example, to permit fragments for a URL whose syntax does not allow
   * them).
   * The modified URL shares unmodified components with the original
   * URL.
   *)

val remove_from_url :
      ?scheme:bool ->
      ?user:bool ->
      ?user_param:bool ->
      ?password:bool ->
      ?host:bool ->
      ?port:bool ->
      ?path:bool ->
      ?param:bool ->
      ?query:bool ->
      ?fragment:bool ->
      ?other:bool ->
      url ->
      url
  (** Removes those components from the URL for which [true] is passed,
   * and returns the modified URL.
   * The modified URL shares unmodified components with the original
   * URL.
   *)

val default_url :
      ?encoded:bool ->
      ?scheme:string ->
      ?user:string ->
      ?user_param:string list ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url ->
      url
  (** Adds the passed components to the URL where they are missing, and
   * returns the modified URL.
   * The modified URL shares unmodified components with the original
   * URL.
   *)

val undefault_url :
      ?scheme:string ->
      ?user:string ->
      ?user_param:string list ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url ->
      url
  (** Removes components from the URL if they have the passed value, and
   * returns the modified URL.
   * Note: The values must always be passed in {b encoded} form! (Unlike
   * the other functions, there is no [encoded] argument.)
   * The modified URL shares unmodified components with the original
   * URL.
   *)

val url_syntax_of_url : url -> url_syntax
  (** Returns the [url_syntax] record of a URL, i.e. the fundamental
   * syntax that every [url] value has.
   *)

val url_of_string : url_syntax -> string -> url
  (** [url_of_string syntax s]: Parses the string [s] according to the
   * passed [syntax] and returns the URL. In contrast to [parse_url], the
   * syntax is not looked up by scheme name but must be passed explicitly.
   *)

val string_of_url : url -> string
  (** Returns the URL as a string. As the examples at the end of this
   * file show, the components appear in their [%]-encoded form.
   *)

val parse_url :
      ?schemes:(string, url_syntax) Hashtbl.t ->
      ?base_syntax:url_syntax ->
      ?accept_8bits:bool ->
      ?enable_fragment:bool ->
      string -> url
  (** Parses the string and returns the URL the string represents.
   * If the URL is absolute (i.e. begins with a scheme like
   * "http:..."), the syntax will be looked up in [schemes].
   * If the URL is relative, the [base_syntax] will be taken
   * if passed. Without [base_syntax], relative URLs cannot be
   * parsed.
   *
   * @param schemes This hashtable maps scheme names to syntax descriptions.
   *   The default is [common_url_syntax].
   * @param base_syntax If passed, the function can parse relative URLs
   *   according to this syntax. If not passed, the function will raise
   *   [Malformed_URL] on a relative URL.
   * @param accept_8bits If [false], the default, it depends on the
   *   syntax descriptions in [schemes] whether 8-bit characters are
   *   accepted in the input or not. If [true], 8-bit characters are
   *   always accepted.
   * @param enable_fragment If [false], the default, it depends on the
   *   syntax descriptions in [schemes] whether fragment identifiers
   *   (e.g. "#fragment") are recognized or not. If [true], fragments
   *   are always recognized.
   * @raise Malformed_URL if the URL is relative and [base_syntax] is
   *   not passed.
   *)

val fixup_url_string : string -> string
  (** Escapes some unsafe or "unwise" characters that are commonly used
   * in URL strings: space, < > \{ \} \[ \] ^ \\ | and double quotes.
   * Call this function before parsing the URL to support these
   * characters.
   *)

val url_provides :
      ?scheme:bool ->
      ?user:bool ->
      ?user_param:bool ->
      ?password:bool ->
      ?host:bool ->
      ?port:bool ->
      ?path:bool ->
      ?param:bool ->
      ?query:bool ->
      ?fragment:bool ->
      ?other:bool ->
      url ->
      bool
  (** Returns [true] iff the URL has all of the components for which
   * [true] is passed.
   *)

val url_scheme    :                  url -> string
val url_user      : ?encoded:bool -> url -> string
val url_user_param: ?encoded:bool -> url -> string list
val url_password  : ?encoded:bool -> url -> string
val url_host      :                  url -> string
val url_port      :                  url -> int
val url_path      : ?encoded:bool -> url -> string list
val url_param     : ?encoded:bool -> url -> string list
val url_query     : ?encoded:bool -> url -> string
val url_fragment  : ?encoded:bool -> url -> string
val url_other     : ?encoded:bool -> url -> string
  (** Return components of the URL. The functions return decoded strings
   * unless [encoded:true] is set. ([url_scheme], [url_host] and [url_port]
   * have no [encoded] argument because the [%]-encoding is not applicable
   * to these components.)
   *
   * This description applies to all of the accessor functions above.
   *
   * @raise Not_found if the component does not exist.
   *)

val split_path : string -> string list
  (** Splits a ['/']-separated path into components (e.g. to set up the
   * [path] argument of [make_url]). A leading or trailing slash results
   * in an empty string as first or last component, respectively.
   * E.g.
   * {[
   * split_path "a/b/c" = [ "a"; "b"; "c" ],
   * split_path "/a/b"  = [ ""; "a"; "b" ],
   * split_path "a/b/"  = [ "a"; "b"; "" ] ]}
   * Beware that [split_path ".."] returns [[".."]] while [split_path "../"]
   * returns [[".."; ""]].  The two will behave differently, for example
   * when used with {!Neturl.apply_relative_url}.
   *)

val join_path : string list -> string
  (** Concatenates the path components, separated by ['/'] (inverse
   * function of [split_path]).
   *)

val norm_path : string list -> string list
  (** Removes ["."] and [".."] from the path if possible. Deletes double
   * slashes (i.e. superfluous empty path components).
   *
   * {b Examples}
   *
   * {ul
   * {- [norm_path ["."] = []]
   *
   *           means: "." = ""}
   * {- [norm_path ["."; ""] = []]
   *
   *           means: "./" = ""}
   * {- [norm_path ["a"; "."] = ["a"; ""]]
   *
   *           means: "a/." = "a/"}
   * {- [norm_path ["a"; "b"; "."] = ["a"; "b"; ""]]
   *
   *           means: "a/b/." = "a/b/"}
   * {- [norm_path ["a"; "."; "b"; "."] = ["a"; "b"; ""]]
   *
   *           means: "a/./b/." = "a/b/"}
   * {- [norm_path [".."] = [".."; ""]]
   *
   *           means: ".." = "../"}
   * {- [norm_path [".."; ""] = [".."; ""]]
   *
   *           means: "../" = "../"}
   * {- [norm_path ["a"; "b"; ".."; "c" ] = ["a"; "c"]]
   *
   *           means: "a/b/../c" = "a/c"}
   * {- [norm_path ["a"; "b"; ".."; "c"; ""] = ["a"; "c"; ""]]
   *
   *           means: "a/b/../c/" = "a/c/"}
   * {- [norm_path ["";"";"a";"";"b"] = [""; "a"; "b"]]
   *
   *           means: "//a//b" = "/a/b"}
   * {- [norm_path ["a"; "b"; ""; ".."; "c"; ""] = ["a"; "c"; ""]]
   *
   *           means: "a/b//../c/" = "a/c/"}
   * {- [norm_path ["a"; ".."] = []]
   *
   *           means: "a/.." = ""}
   * }
   *)


val apply_relative_url : url -> url -> url
  (** [apply_relative_url base rel]:
   * Interprets [rel] relative to [base] and returns the new URL. This
   * function implements RFC 1808.
   *
   * It is not necessary that [rel] has the same syntax as [base].
   * Note, however, that it is checked whether the resulting URL is
   * syntactically correct with the syntax of [base].
   *
   * Examples (the URLs are represented as strings, see {!Neturl.split_path}
   * to split them for {!Neturl.make_url}):
   *
   * - base=["x/y"], rel=["a/b"] => result=["x/a/b"]
   * - base=["x/y/"], rel=["a/b"] => result=["x/y/a/b"]
   * - base=["x/y/.."], rel=["a/b"] => result=["x/y/a/b"]   (beware! The
   *   trailing [".."] is the last path component of the base and is
   *   replaced, just like [y] in the first example)
   * - base=["x/y/../"], rel=["a/b"] => result=["x/a/b"]
   *
   * @raise Malformed_URL if the resulting URL is not syntactically
   *   correct with the syntax of [base].
   *)

val ensure_absolute_url : ?base:url -> url -> url
  (** If the anonymous URL is absolute, it is just returned as result of
   * this function. If the URL is relative, the function tries to make it
   * absolute by resolving it relative to [base].
   *
   * @raise Malformed_URL if the URL is relative and there is no [base],
   *   or if the base URL does not allow the parts that would be added
   *   (e.g. if the anonymous URL possesses a fragment and [base] does not
   *   allow that).
   *)

val file_url_of_local_path : ?getcwd:(unit -> string) -> string -> url
  (** Generates a URL with "file" scheme from the passed path name. The
   * URL is always absolute, i.e. the current directory is prepended if the
   * path is not absolute.
   *
   * Note that no character set conversions are performed.
   *
   * Win32: The input path name may use forward or backward slashes.
   * Absolute paths with drive letters and UNC paths are recognised.
   * Relative paths with drive letters, however, are not recognised
   * (e.g. ["c:file"]), as it is not possible to access the drive-specific
   * working directory from the O'Caml runtime.
   *
   * Cygwin: The input path name may use forward or backward slashes.
   * Absolute paths with drive letters and UNC paths are recognised.
   * The former are translated to ["/cygdrive"] names.
   *
   * @param getcwd A function returning the path that is taken as the
   *   current working directory. Note that for
   *   Win32 this must be either an absolute name with drive letter,
   *   or a UNC path. Default: [Sys.getcwd]
   *)

val local_path_of_file_url : url -> string
  (** Extracts the path from an absolute file URL, and returns a
   * correct path name.
   *
   * If the URL is not a file URL, or is not absolute, the function will
   * fail.
   * NOTE(review): the exception raised on failure is not specified in
   * this interface - confirm against the implementation.
   *
   * Win32: The URL must either contain a drive letter, or must refer
   * to another host.
   *
   * Cygwin: Drive letters and remote URLs are recognised.
   *)


val print_url : url -> unit
  (** Prints the URL. Intended as a printer for the toploop. *)

(* ---------------------------------------------------------------------- *)

(* Special accessors *)

(*
val mailto_recipients : url -> Netaddress.mailbox list
- Returns all recipients, incl. the recipients in the "to" header

val ftp_type : url -> XXX
- Returns the FTP transfer type

val auth_param : url -> XXX
- if the URL has an AUTH style [user_param] the value is returned

val data_content_type : url -> XXX
val data_contents : url -> XXX
- Decomposes "data" URLs
*)

(* ---------------------------------------------------------------------- *)

(* EXAMPLES:
 *
 * let http = Hashtbl.find common_url_syntax "http";;
 * let u = url_of_string http "http://g:pw@host/a/%62/";;
 * string_of_url u;;
 *   --> "http://g:pw@host/a/%62/"
 * url_scheme u;;
 *   --> "http"
 * url_user u;;
 *   --> "g"
 * url_password u;;
 *   --> "pw"
 * url_host u;;
 *   --> "host"
 * url_path u;;
 *   --> [ ""; "a"; "b"; "" ]          (* sic! *)
 * url_path ~encoded:true u;;
 *   --> [ ""; "a"; "%62"; "" ]
 * let v = make_url
 *   ~path:[ ".."; "c" ]
 *   ~fragment:"near-the-#-character"
 *   { (partial_url_syntax http) with url_enable_fragment = Url_part_allowed };;
 * string_of_url v;;
 *   --> "../c#near-the-%23-character"
 * let u' = modify_url ~syntax:(url_syntax_of_url v) u;;
 *    (* u does not permit fragments *)
 * let w = apply_relative_url u' v;;
 * string_of_url w;;
 *   --> "http://g:pw@host/c#near-the-%23-character"
 *)

(* ---------------------------------------------------------------------- *)


This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml