(* $Id: mimestring.mli 1588 2011-04-28 13:59:54Z gerd $ * ---------------------------------------------------------------------- * *) (** Low-level functions to parse and print mail and MIME messages * * [Mimestring] contains a lot of functions to scan and print strings * formatted as MIME messages. For a higher-level view on this topic, * see the [Netmime] module. * * {b Contents} * - {!Mimestring.headers} * - {!Mimestring.structured_values} * - {!Mimestring.parsers_for_structured_values} * - {!Mimestring.printers_for_structured_values} * - {!Mimestring.scanning_mime} * - {!Mimestring.helpers_mime} * *) (* *********************************************************************) (* Collection of auxiliary functions to parse MIME headers *) (* *********************************************************************) (* See also the module Netmime for high-level MIME functions *) (** {1:headers Parsing and Printing Mail Headers} *) val scan_header : ?downcase:bool -> (* default: true *) ?unfold:bool -> (* default: true *) ?strip:bool -> (* default: false *) string -> start_pos:int -> end_pos:int -> ((string * string) list * int) (** [let params, header_end_pos = scan_header s start_pos end_pos]: * * Scans the mail header that begins at position [start_pos] in the string * [s] and that must end somewhere before position [end_pos]. It is intended * that in [end_pos] the character position following the end of the body of * the MIME message is passed. * * Returns the parameters of the header as [(name,value)] pairs (in * [params]), and in [header_end_pos] the position of the character following * directly after the header (i.e. after the blank line separating * the header from the body). * * The following normalizations have already been applied: * - (D) The names are converted to lowercase characters * - (U) Newline characters (CR and LF) in the middle of the header fields * have been removed * - (S) Whitespace at the beginning and at the end of field values has been * removed * * The default is to apply all three normalizations (D), (U), and (S) * (for historic reasons). The three arguments [downcase], [unfold], * and [strip] control which normalizations are performed (and for * historic reasons, too, this is not what you would expect - backwards * compatibility can sometimes be a burden): * * - If [downcase], do (D); if [not downcase], don't do (D). * - If [unfold], do (U); if [not unfold], don't do (U). * - If [unfold || strip], do (S); if [not unfold && not strip], * don't do (S) * - Defaults: [downcase], [unfold], [not strip]. * * This means that [unfold] not only removes CR/LF from the field value, * but also removes whitespace at the beginning and at the end of the * field value. [strip] causes not to remove CR/LF if it occurs * somewhere within the field value, but all whitespace (including * CR/LF) at the beginning of the field value and at the end of the * field value is still deleted. Note that if you only want (S) * you have to pass [~unfold:false] and [~strip:true]. * * The rules to postprocess mail messages in MIME format are {b not} * applied (e.g. encoding transformations as indicated by RFC 2047). * * The function fails if the header violates the header format * strongly. (Some minor deviations are tolerated, e.g. it is sufficient * to separate lines by only LF instead of CRLF.) * * {b The Format of Mail Messages} * * Messages * consist of a header and a body; the first empty line separates both * parts. The header contains lines "{i param-name}[:] {i param-value}" where * the param-name must begin on column 0 of the line, and the "[:]" * separates the name and the value. So the format is roughly: * * {[ * param1-name: param1-value * ... * paramN-name: paramN-value * _ * body ]} * * (Where "_" denotes an empty line.) * * This function wants in [start_pos] the position of the first character of * [param1-name] in the string, and in [end_pos] the position of the character * following [body]. It returns as [header_end_pos] the position where * [body] begins. Furthermore, in [params] all parameters are returned the * function finds in the header. * * {b Details} * * Note that parameter values are restricted; you cannot represent * arbitrary strings. The following problems can arise: * - Values cannot begin with whitespace characters, because there * may be an arbitrary number of whitespaces between the "[:]" and the * value. * - Values (and names of parameters, too) must only be formed of * 7 bit ASCII characters. (If this is not enough, the MIME standard * knows the extension RFC 2047 that allows that header values may * be composed of arbitrary characters of arbitrary character sets. * See below how to decode such characters in values returned by * this function.) * - Header values may be broken into several lines. Continuation * lines must begin with whitespace characters. This means that values * must not contain line breaks as semantic part of the value. * And it may mean that {i one} whitespace character is not distinguishable * from {i several} whitespace characters. * - Header lines must not be longer than 78 characters (soft limit) or * 998 characters (hard limit). Values that * would result into longer lines must be broken into several lines. * This means that you cannot represent strings that contain too few * whitespace characters. * (Note: The soft limit is to avoid that user agents have problems * with long lines. The hard limit means that transfer agents sometimes * do not transfer longer lines correctly.) * - Some old gateways pad the lines with spaces at the end of the lines. * * This implementation of a mail scanner tolerates a number of * deviations from the standard: long lines are not rejected; 8 bit * values are generally accepted; lines may be ended only with LF instead of * CRLF. * * Furthermore, the transformations (D), (U), and (S) can be performed * resulting in values that are simpler to process. * * {b Compatibility} * * This function can parse all mail headers that conform to RFC 822 or * RFC 2822. * * But there may be still problems, as RFC 822 allows some crazy * representations that are actually not used in practice. * In particular, RFC 822 allows it to use backslashes to "indicate" * that a CRLF sequence is semantically meant as line break. As this * function normally deletes CRLFs, it is not possible to recognize such * indicators in the result of the function. *) val read_header : ?downcase:bool -> ?unfold:bool -> ?strip:bool -> Netstream.in_obj_stream -> (string * string) list (** This function expects that the current position of the passed * [in_obj_stream] is the first byte of the header. The function scans the * header and returns it. After that, the stream position is after * the header and the terminating empty line (i.e. at the beginning of * the message body). * * The options [downcase], [unfold], and [strip] have the same meaning * as in [scan_header]. * * {b Example} * * To read the mail message "[file.txt]": * * {[ * let ch = new Netchannels.input_channel (open_in "file.txt") in * let stream = new Netstream.input_stream ch in * let header = read_header stream in * stream#close_in() (* no need to close ch *) * ]} *) val write_header : ?soft_eol:string -> (* default: "\r\n" *) ?eol:string -> (* default: "\r\n" *) Netchannels.out_obj_channel -> (string * string) list -> unit (** This function writes the header to the passed [out_obj_channel]. The * empty line following the header is also written. * * Exact output format: * {ul * {- The header is not folded, i.e. no additional CRLF sequences * are inserted into the header to avoid long header lines. * In order to produce correct headers, the necessary CRLF bytes * must already exist in the field values. (You can use the * function [write_value] below for this.)} * {- However, this function helps getting some details right. First, * whitespace at the beginning of field values is suppressed. * * {b Example:} * * [write_header ch ["x","Field value"; "y"," Other value"]] outputs: * {[ x: Field value\r\n * y: Other value\r\n * \r\n]}} * {- The end-of-line sequences LF, and CRLF, followed by * whitespace are replaced by the passed [soft_eol] string. If the * necessary space or tab character following the eol is missing, an * additional space character will be inserted. * * {b Example:} * * [write_header ch ["x","Field\nvalue"; "y","Other\r\n\tvalue"]] outputs: * {[ x: Field\r\n * value * y: Other\r\n * \tvalue]}} * {- Empty lines (and lines only consisting of whitespace) are suppressed * if they occur inside the header. * * {b Example:} * * [write_header ch ["x","Field\n\nvalue"]] outputs: * {[ x: Field\r\n * value]}} * {- Whitespace at the end of a header field is suppressed. One field * is separated from the next field by printing [eol] once.} * } * * These rules ensure that the printed header will be well-formed with * two exceptions: * - Long lines (> 72 characters) are neither folded nor rejected * - True 8 bit characters are neither properly encoded nor rejected * * These two problems cannot be addressed without taking the syntax * of the header fields into account. See below how to create * proper header fields from [s_token] lists. *) (* *********************************************************************) (** {1:structured_values Parsing Structured Values} *) (** The following types and functions allow it to build scanners for * structured mail and MIME values in a highly configurable way. * * {b Structured Values} * * RFC 822 (together with some other RFCs) defines lexical rules * how formal mail header values should be divided up into tokens. Formal * mail headers are those headers that are formed according to some * grammar, e.g. mail addresses or MIME types. * * Some of the characters separate phrases of the value; these are * the "special" characters. For example, '\@' is normally a special * character for mail addresses, because it separates the user name * from the domain name (as in [user\@domain]). RFC 822 defines a fixed set * of special * characters, but other RFCs use different sets. Because of this, * the following functions allow it to configure the set of special characters. * * Every sequence of characters may be embraced by double quotes, * which means that the sequence is meant as literal data item; * special characters are not recognized inside a quoted string. You may * use the backslash to insert any character (including double quotes) * verbatim into the quoted string (e.g. "He said: \"Give it to me!\""). * The sequence of a backslash character and another character is called * a quoted pair. * * Structured values may contain comments. The beginning of a comment * is indicated by '(', and the end by ')'. Comments may be nested. * Comments may contain quoted pairs. A * comment counts as if a space character were written instead of it. * * Control characters are the ASCII characters 0 to 31, and 127. * RFC 822 demands that mail headers are 7 bit ASCII strings. Because * of this, this module also counts the characters 128 to 255 as * control characters. * * Domain literals are strings embraced by '\[' and '\]'; such literals * may contain quoted pairs. Today, domain literals are used to specify * IP addresses (rare), e.g. [user\@[192.168.0.44]]. * * Every character sequence not falling in one of the above categories * is an atom (a sequence of non-special and non-control characters). * When recognized, atoms may be encoded in a character set different than * US-ASCII; such atoms are called encoded words (see RFC 2047). * * {b Scanning Using the Extended Interface} * * In order to scan a string containing a structured value, you must first * create a [mime_scanner] using the function [create_mime_scanner]. * The scanner contains the reference to the scanned string, and a * specification how the string is to be scanned. The specification * consists of the lists [specials] and [scan_options]. * * The character list [specials] specifies the set of special characters. * These are the characters that are not regarded as part of atoms, * because they work as delimiters that separate atoms (like [@] in the * above example). In addition to this, when '"', '(', and '\[' are * seen as regular characters not delimiting quoted string, comments, and * domain literals, respectively, these characters must also be added * to [specials]. In detail, these rules apply: * * {ul * {- {b Spaces:} * - If [' '] {i in} [specials]: A space character is returned as [Special ' ']. * Note that there may also be an effect on how comments are returned * (see below). * - If [' '] {i not in} [specials]: Spaces are not returned, although * they still delimit atoms. * * } * {- {b Tabs, CRs, LFs:} * - If ['\t'] {i in} [specials]: A tab character is returned as * [Special '\t']. * - If ['\t'] {i not in} [specials]: Tabs are not returned, although * they still delimit atoms. * - If ['\r'] {i in} [specials]: A CR character is returned as * [Special '\r']. * - If ['\r'] {i not in} [specials]: CRs are not returned, although * they still delimit atoms. * - If ['\n'] {i in} [specials]: A LF character is returned as * [Special '\n']. * - If ['\n'] {i not in} [specials]: LFs are not returned, although * they still delimit atoms. * * } * {- {b Comments:} * {ul * {- If ['('] {i in} [specials]: Comments are not recognized. The * character '(' is returned as [Special '('].} * {- If ['('] {i not in} [specials]: Comments are recognized. How comments * are returned, depends on the following: * + If [Return_comments] {i in} [scan_options]: Outer comments are * returned as [Comment] (note that inner comments are recognized but * are not returned as tokens) * + If otherwise [' '] {i in} [specials]: Outer comments are returned as * [Special ' '] * + Otherwise: Comments are recognized but not returned at all. * * } * } * } * {- {b Quoted strings:} * - If ['"'] {i in} [specials]: Quoted strings are not recognized, and * double quotes are returned as [Special '"']. * - If ['"'] {i not in} [specials]: Quoted strings are returned as * [QString] tokens. * * } * {- {b Domain literals:} * {ul * {- If '\[' {i in} [specials]: Domain literals are not recognized, and * left brackets are returned as [Special] '\['.} * {- If '\[' {i not in} [specials]: Domain literals are returned as * [DomainLiteral] tokens.} * } * } * } * * If recognized, quoted strings are returned as [QString s], where * [s] is the string without the embracing quotes, and with already * decoded quoted pairs. * * Control characters [c] are returned as [Control c]. * * If recognized, comments may either be returned as spaces (in the case * you are not interested in the contents of comments), or as [Comment] tokens. * The contents of comments are not further scanned; you must start a * subscanner to analyze comments as structured values. * * If recognized, domain literals are returned as [DomainLiteral s], where * [s] is the literal without brackets, and with decoded quoted pairs. * * Atoms are returned as [Atom s] where [s] is a longest sequence of * atomic characters (all characters which are neither special nor control * characters nor delimiters for substructures). If the option * [Recognize_encoded_words] is on, atoms which look like encoded words * are returned as [EncodedWord] tokens. (Important note: Neither '?' nor * '=' must be special in order to enable this functionality.) * * After the [mime_scanner] has been created, you can scan the tokens by * invoking [scan_token] which returns one token at a time, or by invoking * [scan_token_list] which returns all following tokens. * * There are two token types: [s_token] is the base type and is intended to * be used for pattern matching. [s_extended_token] is a wrapper that * additionally contains information where the token occurs. * * {b Scanning Using the Simple Interface} * * Instead of creating a [mime_scanner] and calling the scan functions, * you may also invoke [scan_structured_value]. This function returns the * list of tokens directly; however, it is restricted to [s_token]. * * {b Examples} * * - Simple address: {[ * scan_structured_value "user\@domain.com" [ '\@'; '.' ] [] * = [ Atom "user"; Special '\@'; Atom "domain"; Special '.'; Atom "com" ] * ]} * - Spaces are not returned: {[ * scan_structured_value "user \@ domain . com" [ '\@'; '.' ] [] * = [ Atom "user"; Special '\@'; Atom "domain"; Special '.'; Atom "com" ] * ]} * - Comments are not returned: {[ * scan_structured_value "user(Do you know him?)\@domain.com" [ '\@'; '.' ] [] * = [ Atom "user"; Special '\@'; Atom "domain"; Special '.'; Atom "com" ] * ]} * - Comments are indicated if requested: {[ * scan_structured_value "user(Do you know him?)\@domain.com" [ '\@'; '.' ] * [ Return_comments ] * = [ Atom "user"; Comment; Special '\@'; Atom "domain"; Special '.'; * Atom "com" ] * ]} * - Spaces are returned if special: {[ * scan_structured_value "user (Do you know him?) \@ domain . com" * [ '\@'; '.'; ' ' ] [] * = [ Atom "user"; Special ' '; Special ' '; Special ' '; Special '\@'; * Special ' '; Atom "domain"; * Special ' '; Special '.'; Special ' '; Atom "com" ] * ]} * - Both spaces and comments are requested: {[ * scan_structured_value "user (Do you know him?) \@ domain . com" * [ '\@'; '.'; ' ' ] [ Return_comments ] * = [ Atom "user"; Special ' '; Comment; Special ' '; Special '\@'; * Special ' '; Atom "domain"; * Special ' '; Special '.'; Special ' '; Atom "com" ] * ]} * - Another case: {[ * scan_structured_value "user \@ domain . com" [ '\@'; '.'; ' ' ] [] * = [ Atom "user"; Special ' '; Special '\@'; Special ' '; Atom "domain"; * Special ' '; Special '.'; Special ' '; Atom "com" ] * ]} * - '(' is special: {[ * scan_structured_value "user(Do you know him?)\@domain.com" ['\@'; '.'; '('] * [] * = [ Atom "user"; Special '('; Atom "Do"; Atom "you"; Atom "know"; * Atom "him?)"; Special '\@'; Atom "domain"; Special '.'; Atom "com" ] * ]} * - Quoted strings: {[ * scan_structured_value "\"My.name\"\@domain.com" [ '\@'; '.' ] [] * = [ QString "My.name"; Special '\@'; Atom "domain"; Special '.'; * Atom "com" ] * ]} * - Encoded words are not returned: {[ * scan_structured_value "=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?=" * [ ] [ ] * = [ Atom "=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?=" ] * ]} * - Encoded words are returned if requested: {[ * scan_structured_value "=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?=" * [ ] [ Recognize_encoded_words ] * = [ EncodedWord(("ISO-8859-1",""), "Q", "Keld_J=F8rn_Simonsen") ] * ]} *) type s_token = Atom of string | EncodedWord of ((string * string) * string * string) (** Args: [((charset,lang),encoding,encoded_word)] *) | QString of string | Control of char | Special of char | DomainLiteral of string | Comment | End (** *) (** A token may be one of: * - [QString s]: The quoted string [s], i.e a string between double * quotes. Quoted pairs are already decoded in [s]. * - [Control c]: The control character [c] (0-31, 127, 128-255) * - [Special c]: The special character [c], i.e. a character from * the [specials] list * - [DomainLiteral s]: The bracketed string [s], i.e. a string between * brackets. Quoted pairs are already decoded in [s]. * - [Comment]: A string between parentheses. This kind of token is only * generated when the option [Return_comments] is in effect. * - [EncodedWord((charset,lang),encoding,encoded_word)]: An RFC-2047 style * encoded word: [charset] is the name of the character set; [lang] is * the language specifier (from RFC 2231) or ""; [encoding] is either * "Q" or "B"; and [encoded_word] is the word encoded in [charset] and * [encoding]. This kind of token is only generated when the option * [Recognize_encoded_words] is in effect (if not, [Atom] is generated * instead). * - [Atom s]: A string which is neither quoted not bracketed nor * written in RFC 2047 notation, and which is not a control or special * character, i.e. the "rest" * - [End]: The end of the string *) type s_option = No_backslash_escaping (** Do not handle backslashes in quoted string and comments as escape * characters; backslashes are handled as normal characters. * For example: The wrong qstring ["C:\dir\file"] will be returned as * [QString "C:\dir\file"] when this option is in effect, and not as * [QString "C:dirfile"] as by default. * -- This is a common error in many MIME implementations. *) | Return_comments (** Comments are returned as token [Comment] (unless '(' is included * in the list of special characters, in which case comments are * not recognized at all). * You may get the exact location of the comment by applying * [get_pos] and [get_length] to the extended token. *) | Recognize_encoded_words (** Enables that encoded words are recognized and returned as * [EncodedWord] instead of [Atom]. *) type s_extended_token (** An opaque type containing the information of [s_token] plus: * - where the token occurs * - RFC-2047 access functions *) val get_token : s_extended_token -> s_token (** Return the [s_token] within the [s_extended_token] *) val get_decoded_word : s_extended_token -> string val get_charset : s_extended_token -> string (** Return the decoded word (the contents of the word after decoding the * "Q" or "B" representation), and the character set of the decoded word * (uppercase). * * These functions not only work for [EncodedWord]. The function * [get_decoded_word] returns for the other kinds of token: * - [Atom]: Returns the atom without decoding it * - [QString]: Returns the characters inside the double quotes, and * ensures that any quoted pairs are decoded * - [Control]: Returns the one-character string * - [Special]: Returns the one-character string * - [DomainLiteral]: Returns the characters inside the brackets, and * ensures that any quoted pairs are decoded * - [Comment]: Returns [""] * * The function [get_charset] returns ["US-ASCII"] for them. *) val get_language : s_extended_token -> string (** Returns the language if the token is an [EncodedWord], and [""] for * all other tokens. *) val get_pos : s_extended_token -> int (** Return the byte position where the token starts in the string * (the first byte has position 0) *) val get_line : s_extended_token -> int (** Return the line number where the token starts (numbering begins * usually with 1) *) val get_column : s_extended_token -> int (** Return the column of the line where the token starts (first column * is number 0) *) val get_length : s_extended_token -> int (** Return the length of the token in bytes *) val separates_adjacent_encoded_words : s_extended_token -> bool (** True iff the current token is white space (i.e. [Special ' '], * [Special '\t'], [Special '\r'] or [Special '\n']) and the last * non-white space token was [EncodedWord] and the next non-white * space token will be [EncodedWord]. * * The background of this function is that white space between * encoded words does not have a meaning, and must be ignored * by any application interpreting encoded words. *) type mime_scanner (** The opaque type of a scanner for structured values *) val create_mime_scanner : specials:char list -> scan_options:s_option list -> ?pos:int -> ?line:int -> ?column:int -> string -> mime_scanner (** Creates a new [mime_scanner] scanning the passed string. * * @param specials The list of characters recognized as special characters. * @param scan_options The list of global options modifying the behaviour * of the scanner * @param pos The position of the byte where the scanner starts in the * passed string. Defaults to 0. * @param line The line number of this first byte. Defaults to 1. * @param column The column number of this first byte. Default to 0. *) (** Note for [create_mime_scanner]: * * The optional parameters [pos], [line], [column] are intentionally placed after * [scan_options] and before the string argument, so you can specify * scanners by partially applying arguments to [create_mime_scanner] * which are not yet connected with a particular string: * {[ * let my_scanner_spec = create_mime_scanner my_specials my_options in * ... * let my_scanner = my_scanner_spec my_string in * ...]} *) val get_pos_of_scanner : mime_scanner -> int val get_line_of_scanner : mime_scanner -> int val get_column_of_scanner : mime_scanner -> int (** Return the current position, line, and column of a [mime_scanner]. * The primary purpose of these functions is to simplify switching * from one [mime_scanner] to another within a string: * * {[ * let scanner1 = create_mime_scanner ... s in * ... now scanning some tokens from s using scanner1 ... * let scanner2 = create_mime_scanner ... * ?pos:(get_pos_of_scanner scanner1) * ?line:(get_line_of_scanner scanner1) * ?column:(get_column_of_scanner scanner1) * s in * ... scanning more tokens from s using scanner2 ... ]} * * {b Restriction:} These functions are not available if the option * [Recognize_encoded_words] is on. The reason is that this option * enables look-ahead scanning; please use the location of the last * scanned token instead. * * Note: To improve the performance of switching, it is recommended to * create scanner specs in advance (see the example [my_scanner_spec] * above). *) val scan_token : mime_scanner -> (s_extended_token * s_token) (** Returns the next token, or [End] if there is no more token. The * token is returned both as extended and as normal token. *) val scan_token_list : mime_scanner -> (s_extended_token * s_token) list (** Returns all following tokens as a list (excluding [End]) *) val scan_structured_value : string -> char list -> s_option list -> s_token list (** This function is included for backwards compatibility, and for all * cases not requiring extended tokens. * * It scans the passed string according to the list of special characters * and the list of options, and returns the list of all tokens. *) val specials_rfc822 : char list val specials_rfc2045 : char list (** The sets of special characters defined by the RFCs 822 and 2045. *) (* *********************************************************************) (* Widely used scanners: *) (** {1:parsers_for_structured_values Parsing Certain Forms of Structured Values} *) val scan_encoded_text_value : string -> s_extended_token list (** Scans a "text" value. The returned token list contains only * [Special], [Atom] and [EncodedWord] tokens. * Spaces, TABs, CRs, LFs are returned (as [Special]) unless * they occur between adjacent encoded words in which case * they are suppressed. The characters '(', '\[', and '"' are also * returned as [Special] tokens, and are not interpreted as delimiters. * * For instance, this function can be used to scan the "Subject" * field of mail messages. *) val scan_value_with_parameters : string -> s_option list -> (string * (string * string) list) (** [let name, params = scan_value_with_parameters s options]: * Scans values with annotations like * [name ; p1=v1 ; p2=v2 ; ...] * For example, MIME types like "text/plain;charset=ISO-8859-1" can * be parsed. * * The values may or may not be quoted. The characters ";", "=", and * even "," are only accepted as part of values when they are quoted. * On sytax errors, the function fails. * * RFC 2231: This function supports some features of this RFC: * Continued parameter values are concatenated. For example: * * {[ * Content-Type: message/external-body; access-type=URL; * URL*0="ftp://"; * URL*1="cs.utk.edu/pub/moore/bulk-mailer/bulk-mailer.tar" ]} * * This is returned as: * {["message/external-body", * [ ("access-type", "URL"); * ("URL", "ftp://cs.utk.edu/pub/moore/bulk-mailer/bulk-mailer.tar") ] ) ]} * * However, encoded parameter values are not handled specially. The * parameter * [title*=us-ascii'en-us'This%20is%20%2A%2A%2Afun%2A%2A%2A] * would be returned as * [("title*", "us-ascii'en-us'This%20is%20%2A%2A%2Afun%2A%2A%2A")]. * Use [scan_values_with_parameters_ep] instead (see below). * * Raises [Failure] on syntax errors. *) type s_param (** The type of encoded parameters (RFC 2231) *) val param_value : s_param -> string val param_charset : s_param -> string val param_language : s_param -> string (** Return the decoded value of the parameter, the charset (uppercase), * and the language. * If the charset is not available, [""] will be returned. * If the language is not available, [""] will be returned. *) val mk_param : ?charset:string -> ?language:string -> string -> s_param (** Creates a parameter from a value (in decoded form). The parameter * may have a charset and a language. *) val print_s_param : Format.formatter -> s_param -> unit (** Prints a parameter to the formatter (as toploop printer) *) val scan_value_with_parameters_ep : string -> s_option list -> (string * (string * s_param) list) (** [let name, params = scan_value_with_parameters_ep s options]: * This version of the scanner copes with encoded parameters according * to RFC 2231. * Note: "ep" means "encoded parameters". * * Example: * [doc.html;title*=us-ascii'en-us'This%20is%20%2A%2A%2Afun%2A%2A%2A] * * The parameter [title] would be returned as: * - name is ["title"] * - value is ["This is ***fun***"] * - charset is ["US-ASCII"] * - language is ["en-us"] * * Raises [Failure] on syntax errors. *) val scan_mime_type : string -> s_option list -> (string * (string * string) list) (** [let name, params = scan_mime_type s options]: * Scans MIME types like * [text/plain; charset=iso-8859-1] * The name of the type and the names of the parameters are converted * to lower case. * * Raises [Failure] on syntax errors. *) val scan_mime_type_ep : string -> s_option list -> (string * (string * s_param) list) (** [let name, params = scan_mime_type_ep s options]: * This version copes with RFC-2231-encoded parameters. * * Raises [Failure] on syntax errors. *) val split_mime_type : string -> (string * string) (** [let (main_type, sub_type) = split_mime_type content_type]: * Splits the MIME type into main and sub type, for example * [ split_mime_type "text/plain" = ("text", "plain") ]. * The returned strings are always lowercase. * * Raises [Failure] on syntax errors. *) (* *********************************************************************) (* Write s_token list *) (** {1:printers_for_structured_values Printing Structured Values} *) exception Line_too_long (** Raised when the hard limit of the line length is exceeded *) val write_value : ?maxlen1:int -> (* Default: no max length *) ?maxlen:int -> (* Default: no max length *) ?hardmaxlen1:int -> (* Default: no hard max length *) ?hardmaxlen:int -> (* Default: no hard max length *) ?fold_qstring:bool -> (* Default: true *) ?fold_literal:bool -> (* Default: true *) ?unused:int ref -> (* Default: do not store this value *) ?hardunused:int ref -> (* Default: do not store this value *) Netchannels.out_obj_channel -> s_token list -> unit (** Writes the list of [s_token] to the [out_obj_channel]. The value * is optionally folded into several lines while writing, but this * is off by default. To enable folding, pass {b both} [maxlen1] and * [maxlen]: * The [maxlen1] parameter specifies the length of the first line * to write, the [maxlen] parameter specifies the length of the * other lines. * * If enabled, folding tries to ensure that the value is written * in several lines that are not longer as specified by * [maxlen1] and [maxlen]. The value is split into lines by inserting * "folding space" at certain locations (which is usually a linefeed * followed by a space character, see below). The following * table specifies between which tokens folding may happen: * * {[ * +=========================================================+ * 1st \ 2nd | Atom | QString | DLiteral | EncWord | Special | Spec ' '| * ==============+======+=========+==========+=========+=========+=========+ * Atom | FS | FS | FS | FS | - | F | * QString | FS | FS | FS | FS | - | F | * DomainLiteral | FS | FS | FS | FS | - | F | * EncodedWord | FS | FS | FS | FS | - | F | * Special | - | - | - | - | - | F | * Special ' ' | - | - | - | - | - | - | * ==============+======+=========+==========+=========+=========+=========+ *]} * * The table shows between which two types of tokens a space or a folding * space is inserted: * - [FS]: folding space * - [F]: linefeed without extra space * - [-]: nothing can be inserted here * * Folding space is ["\n "], i.e. only LF, not CRLF is used as end-of-line * character. The function [write_header] will convert these LF to CRLF * if needed. * * [Special '\t'] is handled like [Special ' ']. Control characters are just * printed, without folding. Comments, however, are substituted by * either space or folding space. The token [End] is ignored. * * Furthermore, folding may also happen within tokens: * - [Atom], [Control], and [Special] are never split up into parts. * They are simply printed. * - [EncodedWord]s, however, are reformatted. This especially means: * adjacent encoded words are first concatenated if possible * (same character set, same encoding, same language), and then * split up into several pieces with optimally chosen lengths. * {b Note:} Because this function gets [s_token] as input and not * [s_extended_token], it is not known whether [Special ' '] tokens * (or other whitespace) between adjacent EncodedWords must be * ignored. Because of this, [write_value] only reformats adjacent encoded * words when there is not any whitespace between them. * - [QString] may be split up in a special way unless [fold_qstring] * is set to [false]. For example, ["One Two Three"] may be split up into * three lines ["One\n Two\n \ Three"]. Because some header fields * explicitly forbid folding of quoted strings, it is possible to * set [~fold_qstring:false] (it is [true] by default). * {b Note:} Software should not rely on that the different types of * whitespace (especially space and TAB) remain intact at the * beginning of a line. Furthermore, it may also happen that * additional whitespace is added at the end of a line by the * transport layer. * - [DomainLiteral]: These are handled like [QString]. The parameter * [~fold_literal:false] turns folding off if it must be prevented, * it is [true] by default. * - [Comment]: Comments are effectively omitted! Instead of [Comment], * a space or folding space is printed. However, you can output comments * by passing sequences like [ Special "("; ...; Special ")" ]. * * It is possible to get the actual number of characters back that * can still be printed into the last line without making the line * too long. Pass an [int ref] as [unused] to get this value (it may * be negative!). Pass an * [int ref] as [hardunused] to get the number of characters that may * be printed until the hard limit is exceeded. * * The function normally does not fail when a line becomes too long, * i.e. it exceeds [maxlen1] or [maxlen]. * However, it is possible to specify a hard maximum length * ([hardmaxlen1] and [hardmaxlen]). If these are exceeded, the function * will raise [Line_too_long]. * * For electronic mail, a [maxlen] of 78 and a [hardmaxlen] of 998 is * recommended. * * {b Known Problems:} * - The reformatter for EncodedWords takes into * account that multi-byte characters must not be split up. However, * this works only when the multi-byte character set is known * to [Netconversion]. You can assume that UTF-8 and UTF-16 always * work. If the character set is not known the reformatter may * split the string at wrong positions. * - The reformatter for EncodedWords may parse the token, and if * this fails, you will get the exception [Malformed_code]. * This is only done in some special cases, however. * - The function prints spaces between adjacent atoms. Although * this is allowed in principal, other MIME implementations might fail when * there are spaces at unexpected locations. Workaround: If * no spaces are desired, concatenate adjacent atoms before * passing them to this function. * * {b Further Tips:} * - Pass ~maxlen1:0 and ~maxlen:0 to get shortest lines * - Use the reformatter for encoded words! It works well. For * example, to output a long sentence, just wrap it into * {b one} [EncodedWord]. The reformatter takes care to * fold the word into several lines. *) val param_tokens : ?maxlen:int -> (string*s_param) list -> s_token list (** Formats a parameter list. For example, * [[ "a", "b"; "c", "d" ]] is transformed to the token sequence * corresponding to [; a=b; c=d]. * If [maxlen] is specified, it is ensured that the individual * parameter (e.g. ["a=b;"]) is not longer than [maxlen-1], such that * it will fit into a line with maximum length [maxlen]. * By default, no maximum length is guaranteed. * If [maxlen] is passed, or if a parameter specifies a character * set or language, the encoding of RFC 2231 will be applied. If these * conditions are not met, the parameters will be encoded traditionally. *) val split_uri : string -> s_token list (** Splits a long URI according to the algorithm of RFC 2017. * The input string must only contain 7 bit characters, and * must be, if necessary, already be URL-encoded. *) (* *********************************************************************) (* Scanners for MIME bodies *) (** {1:scanning_mime Scanning MIME Messages} *) val scan_multipart_body : string -> start_pos:int -> end_pos:int -> boundary:string -> ((string * string) list * string) list (** [let [params1, value1; params2, value2; ...] * = scan_multipart_body s start_pos end_pos boundary]: * * Scans the string [s] that is the body of a multipart message. * The multipart message begins at position [start_pos] in [s], and * [end_pos] is the position * of the character following the message. In [boundary] the boundary string * must be passed (this is the "boundary" parameter of the multipart * MIME type, e.g. [multipart/mixed;boundary="some string"] ). * * The return value is the list of the parts, where each part * is returned as pair [(params, value)]. The left component [params] * is the list of name/value pairs of the header of the part. The * right component is the raw content of the part, i.e. if the part * is encoded ("content-transfer-encoding"), the content is returned * in the encoded representation. The caller is responsible for decoding * the content. * * The material before the first boundary and after the last * boundary is not returned. * * {b Multipart Messages} * * The MIME standard defines a way to group several message parts to * a larger message (for E-Mails this technique is known as "attaching" * files to messages); these are the so-called multipart messages. * Such messages are recognized by the major type string "multipart", * e.g. [multipart/mixed] or [multipart/form-data]. Multipart types MUST * have a [boundary] parameter because boundaries are essential for the * representation. * * Multipart messages have a format like (where "_" denotes empty lines): * {[ * ...Header... * Content-type: multipart/xyz; boundary="abc" * ...Header... * _ * Body begins here ("prologue") * --abc * ...Header part 1... * _ * ...Body part 1... * --abc * ...Header part 2... * _ * ...Body part 2 * --abc * ... * --abc-- * Epilogue ]} * * The parts are separated by boundary lines which begin with "--" and * the string passed as boundary parameter. (Note that there may follow * arbitrary text on boundary lines after "--abc".) The boundary is * chosen such that it does not occur as prefix of any line of the * inner parts of the message. * * The parts are again MIME messages, with header and body. Note * that it is explicitely allowed that the parts are even multipart * messages. * * The texts before the first boundary and after the last boundary * are ignored. * * Note that multipart messages as a whole MUST NOT be encoded. * Only the PARTS of the messages may be encoded (if they are not * multipart messages themselves). * * Please read RFC 2046 if want to know the gory details of this * brain-dead format. *) val scan_multipart_body_and_decode : string -> start_pos:int -> end_pos:int -> boundary:string -> ((string * string) list * string) list (** Same as [scan_multipart_body], but decodes the bodies of the parts * if they are encoded using the methods "base64" or "quoted printable". * Fails, if an unknown encoding is used. *) val scan_multipart_body_from_netstream : Netstream.in_obj_stream -> boundary:string -> create:((string * string) list -> 'a) -> add:('a -> Netstream.in_obj_stream -> int -> int -> unit) -> stop:('a -> unit) -> unit (** [scan_multipart_body_from_netstream s boundary create add stop]: * * Reads the MIME message from the netstream [s] block by block. The * parts are delimited by the [boundary]. * * Once a new part is detected and begins, the function [create] is * called with the MIME header as argument. The result [p] of this function * may be of any type. * * For every chunk of the part that is being read, the function [add] * is invoked: [add p s k n]. * * Here, [p] is the value returned by the [create] invocation for the * current part. [s] is the netstream. The current window of [s] contains * the read chunk completely; the chunk begins at position [k] of the * window (relative to the beginning of the window) and has a length * of [n] bytes. * * When the part has been fully read, the function [stop] is * called with [p] as argument. * * That means, for every part the following is executed: * - [let p = create h] * - [add p s k1 n1] * - [add p s k2 n2] * - ... * - [add p s kN nN] * - [stop p] * * {b Important Precondition:} * - The block size of the netstream [s] must be at least * [String.length boundary + 4] * * {b Exceptions:} * - Exceptions can happen because of ill-formed input, and within * the callbacks of the functions [create], [add], [stop]. * - If the exception happens while part [p] is being read, and the * [create] function has already been called (successfully), the * [stop] function is also called (you have the chance to close files). * The exception is re-raised after [stop] returns. *) val read_multipart_body : (Netstream.in_obj_stream -> 'a) -> string -> Netstream.in_obj_stream -> 'a list (** This is the "next generation" multipart message parser. It is * called as follows: * * [let parts = read_multipart_body f boundary s] * * As precondition, the current position of the stream [s] must be at * the beginning of the message body. The string [boundary] must * be the message boundary (without "--"). The function [f] is called * for every message part, and the resulting list [parts] is the * concatentation of the values returned by [f]. * * The stream passed to [f] is a substream of [s] that begins at the * first byte of the header of the message part. The function [f] * can read data from the substream as necessary. The substream * terminates at the end of the message part. This means that [f] can simply * read the data of the substream from the beginning to the end. It is * not necessary that [f] reads the substream until EOF, however. * * After all parts have been read, the trailing material of stream [s] * is skipped until EOF of [s] is reached. *) (* *********************************************************************) (* Tools to create multipart messages *) (** {1:helpers_mime Helpers for MIME Messages} *) val create_boundary : ?random:string list -> ?nr:int -> unit -> string (** Creates a boundary string that can be used to separate multipart * messages. * The string is 63 characters long and has the following "features": * - Most of the string consists of the minus character yielding * a clear optical effect * - The string contains "=__". This sequence cannot be obtained * by the quoted-printable encoding, so you need not to care whether * strings encoded as quoted-printable contain the boundary. * - The string contains "<&>;" which is illegal in HTML, XML, and * SGML. * - The string does not contain double quotes or backslashes, * so you can safely put double quotes around it in the MIME header. * - The string contains [nr], so you can safely distinguish between * several boundaries occurring in the same MIME body if you * assign different [nr]. * - The string contains a hash value composed of the first * 256 bytes of all strings passed as [random], and influenced * by the current GC state. *) (* THREAD-SAFETY: * The functions are thread-safe as long as the threads do not share * values. *)