(* $Id: neturl.mli 1662 2011-08-29 23:05:06Z gerd $
* ----------------------------------------------------------------------
*
*)
(* This module applies already O'Caml-3 features. *)
(** Uniform Resource Locators (URLs)
*
* {b Contents}
*
* - {!Neturl.interface}
*
* The tutorial has been moved to {!Neturl_tut}.
*)
(** {1:interface Interface}
*
* This module provides functions to parse URLs, to print URLs, to
* store URLs, to modify URLs, and to apply relative URLs.
*
* URLs are strings formed according to pattern (1) or (2):
*
* + [scheme://user;userparams:password\@host:port/path;params?query#fragment]
* + [scheme:other;params?query#fragment]
*
* The word at the beginning of the URL identifies the URL scheme
* (such as "http" or "file"). Depending on the scheme, not all of the
* parts are allowed, or parts may be omitted. This module defines the
* type [url_syntax] whose values describe which parts are allowed/required/
* not allowed for a concrete URL scheme (see below).
*
* Not all characters are allowed in a URL. Some characters are allowed,
* but have the special task to separate the various parts of the URL
* (reserved characters).
* However, it is possible to include even invalid or reserved characters
* as normal content by applying the [%]-encoding on these characters:
* A ['%'] indicates that an encoded character follows, and the character
* is denoted by a two-digit hexadecimal number (e.g. [%2f] for ['/']).
* In the following descriptions, the term "encoded string" means a string
* containing such [%]-encoded characters, and the "decoded string" means a
* string not containing such characters.
* See the module {!Netencoding.Url} for functions encoding or decoding
* strings.
*
* The type [url] describes values storing the components of a URL,
* and the [url_syntax] for the URL. In general, the components are
* stored as encoded strings; however, not for all components the
* [%]-encoding is applicable.
*
* For convenience, the functions creating, modifying, and accessing
* URLs can handle both encoded and decoded strings. In order to
* avoid errors, the functions pass strings even in their decoded form.
*
* Note that there is currently no function to compare URLs. The
* canoncical comparison ( [=] ) is not applicable because the same URL
* may be written in different ways.
*
* Note that nothing is said about the character set/encoding of URLs.
* Some protocols and standards prefer UTF-8 as fundamental encoding
* and apply the [%]-encoding on top of it; i.e. the byte sequence
* representing a character in UTF-8 is [%]-encoded.
*
* {b Standards Compliance}
*
* This module implements RFC 1738 and RFC 1808. There is also a newer
* RFC, 2396, updating the former RFCs, but this module is not fully
* compatible with RFC 2396. The following (minor) problems may occur:
*
* - The module escapes more characters than needed. All characters that
* are "unsafe" or "reserved" in either RFC document are escaped.
* - URL parameters (appended with a ";") are handled as in RFCs 1738/1808.
* In RFC 2396, every path component may have parameters, and the
* algorithm to resolve relative URLs is different in this point.
* If it is required to apply RFC 2396, one can disable URL parameters
* in the syntax, and extract them from the path by a self-written
* postprocessor. Usually, this is only required for [imap] URLs.
*
* In one point, RFC 2396 is preferred:
*
* - Authorities may be terminated by a question mark, as in
* ["http://host?query"]. This is illegal in RFC 1738. The consequence
* is, however, that question marks in user strings must be escaped.
*
* RFC 3986 introduces IPv6 addresses. These are now supported (but see
* the comments below).
*)
exception Malformed_URL
(** Raised by a number of functions when encountering a badly formed
* URL.
*)
val extract_url_scheme : string -> string
(** Returns the URL scheme from the string representation of an URL.
* E.g. [extract_url_scheme "http://host/path" = "http"].
* The scheme name is always converted to lowercase characters.
* Raises [Malformed_URL] if the scheme name is not found.
*)
type url_syntax_option =
Url_part_not_recognized (** The part, even if there, is not even recognized *)
| Url_part_allowed (** The part can be present *)
| Url_part_required (** The part must be present *)
type url_syntax =
{ url_enable_scheme : url_syntax_option;
url_enable_user : url_syntax_option;
url_enable_user_param: url_syntax_option;
url_enable_password : url_syntax_option;
url_enable_host : url_syntax_option;
url_enable_port : url_syntax_option;
url_enable_path : url_syntax_option;
url_enable_param : url_syntax_option;
url_enable_query : url_syntax_option;
url_enable_fragment : url_syntax_option;
url_enable_other : url_syntax_option;
url_accepts_8bits : bool;
url_is_valid : url -> bool;
url_enable_relative : bool;
}
(** Values of type [url_syntax] describe which components of an URL are
* recognized, which are allowed (and optional), and which are required.
* Not all combinations are valid; the predicate expressed by the
* function [url_syntax_is_valid] must hold.
*
* The function [url_is_valid] is applied when a fresh URL is created
* and must return [true]. This function allows it to add an arbitrary
* validity criterion to [url_syntax]. (Note that the URL passed to
* this function is not fully working; you can safely assume that the
* accessor functions [url_scheme] etc. can be applied to it.)
*
* Switch [url_accepts_8bit]: If [true], the bytes with code 128 to
* 255 are treated like alphanumeric characters; if [false] these bytes
* are illegal (but it is still possible to include such byte in their
* encoded form: [%80] to [%FF]).
*
* Switch [url_enable_relative]: If [true], the syntax allows relative
* URLs in principle. Actually, parsing of relative URLs is possible
* when the optional parts are flagged as [Url_part_allowed] and not
* as [Url_part_required]. However, it is useful to specify URL syntaxes
* always as absolute URLs, and to weaken them on demand when a relative
* URL is found by the parser. This switch enables that. In particular,
* the function [partial_url_syntax] checks this flag.
*)
and url
(** Values of type [url] describe concrete URLs. Every URL must have
* a fundamental [url_syntax], and it is only possible to create URLs
* conforming to the syntax. See [make_url] for further information.
*)
;;
val url_syntax_is_valid : url_syntax -> bool
(** Checks whether the passed [url_syntax] is valid. This means:
* - If passwords are recognized, users (and hosts) must be recognized, too
* - If ports are recognized, hosts must be recognized, too
* - If users are recognized, hosts must be recognized, too
* - Either the syntax recognizes one of the phrases
* \{ user, password, host, port, path \}, or the syntax recognized
* the phrase 'other'.
*)
val partial_url_syntax : url_syntax -> url_syntax
(** Transforms the syntax into another syntax where all required parts are
* changed into optional parts.
*)
(* Note that all following url_syntaxes do not allow 8bit bytes. *)
val null_url_syntax : url_syntax
(** An URL syntax that recognizes nothing. Use this as base for your own
* definitions, e.g.
* {[
* let my_syntax = { null_url_syntax with
* url_enable_host = Url_part_required; ... }
* ]}
*)
val ip_url_syntax : url_syntax
(** Syntax for IP based protocols. This syntax allows scheme, user,
* password, host, port, path, param, query, fragment, but not "other".
* It does not accept 8 bit bytes.
*)
val common_url_syntax : (string, url_syntax) Hashtbl.t
(** Syntax descriptions for common URL schemes. The key of the hashtable
* is the scheme name, and the value is the corresponding syntax.
*
* - ["file"]: scheme, host?, path
* - ["ftp"]: scheme, user?, password?, host, port?, path?, param?
* Note: param is not checked.
* - ["http"], ["https"]:
* scheme, user?, password?, host, port?, path?, query?
* - ["mailto"]: scheme, other, query? (RFC 2368)
* - ["pop"], ["pops"]: scheme, user?, user_param?, password?, host, port?
* Note: user_param is not checked.
* (RFC 2384)
* - ["imap"], ["imaps"]: scheme, user?, user_param?, password?, host, port?,
* path?, query? (RFC 2192)
* Note: "param" is intentionally not recognized to get the resolution of
* relative URLs as described in the RFC. When analysing this kind of URL,
* it is recommended to re-parse it with "param" enabled.
* - ["news"]: scheme, other (RFC 1738)
* - ["nntp"], ["nntps"]: scheme, host, port?, path (with two components)
* (RFC 1738)
* - ["data"]: scheme, other (RFC 2397). "other" is not further decomposed.
* - ["ipp"], ["ipps"]: scheme, host, port? , path?, query? (RFC 3510)
* - ["cid"], ["mid"]: Content/message identifiers: scheme, other
*
* Notes:
* - These syntax descriptions can be weakened for partial/relative URLs
* by changing the required parts to optional parts: See the function
* [partial_url_syntax].
* - None of the descriptions allows fragments. These can be enabled by
* setting [url_enable_fragment] to [Url_part_allowed]. E.g.
* {[ { file_url_syntax with url_enable_fragment = Url_part_allowed } ]}
* - 8 bit bytes are not accepted
* - A large number of standardised scheme syntaxes are not available,
* e.g. gopher, prospero, wais. The selection is a bit subjective,
* but I have tried to omit protocols that are no longer in common
* use, or that are very special.
* - The LDAP URL syntax (RFC 1959) does not fit into our scheme, it
* is omitted for now because of this.
*)
val null_url : url
(** A URL without any component and [null_url_syntax]
*)
val make_url :
?encoded:bool ->
?scheme:string ->
?user:string ->
?user_param:string list ->
?password:string ->
?host:string ->
?addr:Unix.inet_addr ->
?port:int ->
?socksymbol: Netsockaddr.socksymbol ->
?path:string list ->
?param:string list ->
?query:string ->
?fragment:string ->
?other:string ->
url_syntax ->
url
(** Creates a URL from components:
*
* - The components [scheme] and [host] are simple strings to which the
* [%]-encoding is not applicable. [host] may be a (DNS) name, an
* IPv4 address as "dotted quad", or an IPv6 address enclosed in
* brackets.
* - [addr] also sets [host], but directly from an [inet_addr].
* - The component [port] is a simple number. Of course, the [%]-encoding
* is not applicable, too.
* - [socksymbol] sets both [host] and [port] from the socksymbol of
* type [`Inet] or [`Inet_byname].
* - The components [user], [password], [query], [fragment], and [other]
* are strings which may contain [%]-encoded characters. By default,
* you can pass any string for these components, and problematic characters
* are automatically encoded. If you set [encoded:true], the passed
* strings must already be encoded, but the function checks whether
* the encoding is syntactically correct.
* Note that for [query] even the characters ['?'] and ['='] are encoded
* by default, so you need to set [encoded:true] to pass a reasonable
* query string.
* - The components [user_param], [path] and [param] are lists of strings which may
* contain [%]-encoded characters. Again, the default is to pass
* decoded strings to the function, and the function encodes them
* automatically, and by setting [encoded:true] the caller is responsible
* for encoding the strings. Passing empty lists for these components
* means that they are not part of the constructed URL.
* See below for the respresentation of these components.
*
* [socksymbol] has precedence over [addr], which has precedence over
* [host]. [socksymbol] also has precedence over [port].
*
* The strings representing the components do not
* contain the characters separating the components from each other.
*
* The created URL must conform to the [url_syntax], i.e.:
* - The URL must only contain components which are recognized by the
* syntax
* - The URL must contain components which are required by the syntax
* - The URL must fulfill the predicate expressed by the [url_is_valid]
* function of the syntax.
*
* The path of a URL is represented as a list of ['/']-separated path
* components. i.e.
*
* [ [ s1; s2; ...; sN ] ] represents the path
* [s1 ^ "/" ^ s2 ^ "/" ^ ... ^ "/" ^ sN]
*
* As special cases:
* - [[]] is the non-existing path
* - [[ "" ]] is ["/"]
* - [[ "";"" ]] is illegal
*
* Except of [s1] and [sN], the path components must not be empty strings.
*
* To avoid ambiguities, it is illegal to create URLs with both relative
* paths ([s1 <> ""]) and host components.
*
* Parameters of URLs ([param] and [user_param]) are components
* beginning with [';']. The list
* of parameters is represented as list of strings where the strings
* contain the value following [';'].
*)
val modify_url :
?syntax:url_syntax ->
?encoded:bool ->
?scheme:string ->
?user:string ->
?user_param:string list ->
?password:string ->
?host:string ->
?addr:Unix.inet_addr ->
?port:int ->
?socksymbol: Netsockaddr.socksymbol ->
?path:string list ->
?param:string list ->
?query:string ->
?fragment:string ->
?other:string ->
url ->
url
(** Modifies the passed components and returns the modified URL.
* The modfied URL shares unmodified components with the original
* URL.
*)
val remove_from_url :
?scheme:bool ->
?user:bool ->
?user_param:bool ->
?password:bool ->
?host:bool ->
?port:bool ->
?path:bool ->
?param:bool ->
?query:bool ->
?fragment:bool ->
?other:bool ->
url ->
url
(** Removes the [true] components from the URL, and returns the modified
* URL.
* The modfied URL shares unmodified components with the original
* URL.
*)
val default_url :
?encoded:bool ->
?scheme:string ->
?user:string ->
?user_param:string list ->
?password:string ->
?host:string ->
?port:int ->
?path:string list ->
?param:string list ->
?query:string ->
?fragment:string ->
?other:string ->
url ->
url
(** Adds missing components and returns the modified URL.
* The modfied URL shares unmodified components with the original
* URL.
*)
val undefault_url :
?scheme:string ->
?user:string ->
?user_param:string list ->
?password:string ->
?host:string ->
?port:int ->
?path:string list ->
?param:string list ->
?query:string ->
?fragment:string ->
?other:string ->
url ->
url
(** Removes components from the URL if they have the passed value, and
* returns the modified URL.
* Note: The values must always be passed in {b encoded} form!
* The modfied URL shares unmodified components with the original
* URL.
*)
val url_syntax_of_url : url -> url_syntax
(** Returns the [url_syntax] record of a URL. *)
val url_of_string : url_syntax -> string -> url
(** Parses the passed string according to the passed [url_syntax]. *)
val string_of_url : url -> string
(** Returns the URL as string *)
val parse_url :
?schemes:(string, url_syntax) Hashtbl.t ->
?base_syntax:url_syntax ->
?accept_8bits:bool ->
?enable_fragment:bool ->
string -> url
(** Parses the string and returns the URL the string represents.
* If the URL is absolute (i.e. begins with a scheme like
* "http:..."), the syntax will be looked up in [schemes].
* If the URL is relative, the [base_syntax] will be taken
* if passed. Without [base_syntax], relative URLs cannot be
* parsed.
*
* @param schemes This hashtable maps scheme names to syntax descriptions.
* The default is [common_url_syntax].
* @param base_syntax If passed, the function can parse relative URLs
* according to this syntax. If not passed, the function will raise
* [Malformed_URL] on a relative URL.
* @param accept_8bits If [false], the default, it depends on the
* syntax descriptions in [schemes] whether 8 bit characters are
* accepted in the input or not. If [true], 8 bit characters are
* always accepted.
* @param enable_fragment If [false], the default, it depends on the
* syntax descriptions in [schemes] whether fragment identifiers
* (e.g. "#fragment") are recognized or not. If [true], fragments
* are always recognized.
*)
val fixup_url_string : ?escape_hash:bool -> string -> string
(** Escapes some unsafe or "unwise" characters that are commonly used
* in URL strings: space, < > \{ \} ^ \\ | and double quotes.
* Call this function before parsing the URL to support these
* characters.
*
* If [escape_hash] is set, '#' is also escaped.
*
* Change: Since Ocamlnet-3.4, square brackets are no longer fixed up,
* because they have now a legal use to denote IPv6 addresses.
*)
val url_provides :
?scheme:bool ->
?user:bool ->
?user_param:bool ->
?password:bool ->
?host:bool ->
?port:bool ->
?path:bool ->
?param:bool ->
?query:bool ->
?fragment:bool ->
?other:bool ->
url ->
bool
(** Returns [true] iff the URL has all of the components passed with
* [true] value.
*)
val url_scheme : url -> string
val url_user : ?encoded:bool -> url -> string
val url_user_param: ?encoded:bool -> url -> string list
val url_password : ?encoded:bool -> url -> string
val url_host : url -> string
val url_port : url -> int
val url_path : ?encoded:bool -> url -> string list
val url_param : ?encoded:bool -> url -> string list
val url_query : ?encoded:bool -> url -> string
val url_fragment : ?encoded:bool -> url -> string
val url_other : ?encoded:bool -> url -> string
(** Return components of the URL. The functions return decoded strings
* unless [encoded:true] is set.
* If the component does not exist, the exception [Not_found]
* is raised.
*
* Note that IPv6 addresses, when returned by [url_host], are enclosed
* in square brackets. Modules calling [url_host] may require porting
* to support this syntax variant.
*)
val url_addr : url -> Unix.inet_addr
(** If the [host] part of the URL is an IP address, the address is returned.
Works for IPv4 and IPv6 addresses. Otherwise [Not_found] is raised.
*)
val url_socksymbol : url -> int -> Netsockaddr.socksymbol
(** [url_socksymbol url default_port]: Returns the [host] and [port] parts
of the URL as [socksymbol]. If the port is missing in the URL,
[default_port] is substituted. If the [host] is missing in the URL
the exception [Not_found] is raised.
*)
val split_path : string -> string list
(** Splits a ['/']-separated path into components (e.g. to set up the
* [path] argument of [make_url]).
* E.g.
* {[
* split_path "a/b/c" = [ "a"; "b"; "c" ],
* split_path "/a/b" = [ ""; "a"; "b" ],
* split_path "a/b/" = [ "a"; "b"; "" ] ]}
* Beware that [split_path ".."] returns [[".."]] while [split_path "../"]
* returns [[".."; ""]]. The two will behave differently, for example
* when used with {!Neturl.apply_relative_url}.
*)
val join_path : string list -> string
(** Concatenates the path components (reverse function of split_path).
*)
val norm_path : string list -> string list
(** Removes ["."] and [".."] from the path if possible. Deletes double slashes.
*
* {b Examples}
*
* {ul
* {- [norm_path ["."] = []]
*
* means: "." = ""}
* {- [norm_path ["."; ""] = []]
*
* means: "./" = ""}
* {- [norm_path ["a"; "."] = ["a"; ""]]
*
* means: "a/." = "a/"}
* {- [norm_path ["a"; "b"; "."] = ["a"; "b"; ""]]
*
* means: "a/b/." = "a/b/"}
* {- [norm_path ["a"; "."; "b"; "."] = ["a"; "b"; ""]]
*
* means: "a/./b/." = "a/b/"}
* {- [norm_path [".."] = [".."; ""]]
*
* means: ".." = "../"}
* {- [norm_path [".."; ""] = [".."; ""]]
*
* means: "../" = "../"}
* {- [norm_path ["a"; "b"; ".."; "c" ] = ["a"; "c"]]
*
* means: "a/b/../c" = "a/c"}
* {- [norm_path ["a"; "b"; ".."; "c"; ""] = ["a"; "c"; ""]]
*
* means: "a/b/../c/" = "a/c/"}
* {- [norm_path ["";"";"a";"";"b"] = [""; "a"; "b"]]
*
* means: "//a//b" = "/a/b"}
* {- [norm_path ["a"; "b"; ""; ".."; "c"; ""] = ["a"; "c"; ""]]
*
* means: "a/b//../c/" = "a/c/"}
* {- [norm_path ["a"; ".."] = []]
*
* means: "a/.." = ""}
* }
*)
val apply_relative_url : url -> url -> url
(** [apply_relative_url base rel]:
* Interprets [rel] relative to [base] and returns the new URL. This
* function implements RFC 1808.
*
* It is not necessary that [rel] has the same syntax as [base].
* Note, however, that it is checked whether the resulting URL is
* syntactically correct with the syntax of [base]. If not, the
* exception [Malformed_URL] will be raised.
*
* Examples (the URLs are represented as strings, see {!Neturl.split_path}
* to split them for {!Neturl.make_url}):
*
* base="x/y", url="a/b" => result="x/a/b"
* base="x/y/", url="a/b" => result="x/y/a/b"
* base="x/y/..", url="a/b" => result="x/y/a/b" (beware!)
* base="x/y/../", url="a/b" => result="x/a/b"
*)
val ensure_absolute_url : ?base:url -> url -> url
(** If the anonymous URL is absolute, it is just returned as result of
* this function. If the URL is relative, it is tried to make it
* absolute by resolving it relative to [base]. If there is no [base]
* or if the the base URL does not allow the parts that would be added
* (e.g. if the anonymous URL possesses a fragment and [base] does not
* allow that), this will fail, and the function raises [Malformed_URL].
*)
val file_url_of_local_path : ?getcwd:(unit -> string) -> string -> url
(** Generates a URL with "file" scheme from the passed path name. The
* URL is always absolute, i.e. the current directory is prepended if the
* path is not absolute.
*
* Note that no character set conversions are performed.
*
* Win32: The input path name may use forward or backward slashes.
* Absolute paths with drive letters and UNC paths are recognised.
* Relative paths with drive letters, however, are not recognised
* (e.g. ["c:file"]), as it is not possible to access the drive-specific
* working directory from the O'Caml runtime.
*
* Cygwin: The input path name may use forward or backward slashes.
* Absolute paths with drive letters and UNC paths are recognised.
* The former are translated to ["/cygdrive"] names.
*
* @param getcwd The function returns the path taken as current working
* directory. Note that for
* Win32 this must be either an absolute name with drive letter,
* or an UNC path. Default: [Sys.getcwd]
*)
val local_path_of_file_url : url -> string
(** Extracts the path from an absolute file URL, and returns a
* correct path name.
*
* If the URL is not a file URL, or is not absolute, the function will
* fail.
*
* Win32: The URL must either contain a drive letter, or must refer
* to another host.
*
* Cygwin: Drive letters and remote URLs are recognised.
*)
val print_url : url -> unit
(** Printer for the toploop. *)
(* ---------------------------------------------------------------------- *)
(* Special accessors *)
(*
val mailto_recipients : url -> Netaddress.mailbox list
- Returns all recipients, incl the recpients in the "to" header
val ftp_type : url -> XXX
- Returns the FTP transfer type
val auth_param : url -> XXX
- if the URL has an AUTH style [user_param] the value is returned
val data_content_type : url -> XXX
val data_contents : url -> XXX
- Decomposes "data" URLs
*)
(* ---------------------------------------------------------------------- *)
(* EXAMPLES:
*
* let http = Hashtbl.find common_url_syntax "http";;
* let u = url_of_string http "http://g:pw@host/a/%62/";;
* string_of_url u;;
* --> "http://g:pw@host/a/%62/"
* url_scheme u;;
* --> "http"
* url_user u;;
* --> "g"
* url_password u;;
* --> "pw"
* url_host u;;
* --> "host"
* url_path u;;
* --> [ ""; "a"; "b"; "" ] (* sic! *)
* url_path ~encoded:true u;;
* --> [ ""; "a"; "%62"; "" ]
* let v = make_url
* ~path:[ ".."; "c" ]
* ~fragment:"near-the-#-character"
* { (partial_url_syntax http) with url_enable_fragment = Url_part_allowed };;
* string_of_url v;;
* --> "../c#near-the-%23-character"
* let u' = modify_url ~syntax:(url_syntax_of_url v) u;;
* (* u does not permit fragments *)
* let w = apply_relative_url u' v;;
* string_of_url w;;
* --> "http://g:pw@host/c#near-the-%23-character"
*)
(* ---------------------------------------------------------------------- *)