Plasma GitLab Archive
Projects Blog Knowledge

(* $Id: netmime.mli 1003 2006-09-24 15:17:15Z gerd $
 * ----------------------------------------------------------------------
 *
 *)

(** Netmime contains high-level classes and functions to process
 * mail and MIME messages.
 *
 * {b Contents}
 *
 * - {!Netmime.types}
 * - {!Netmime.classes}
 * - {!Netmime.parsing}
 * - {!Netmime.printing}
 *
 * The tutorial has been moved to {!Netmime_tut}.
 *)

(* ***************************** Types ******************************** *)

(** {1:types Types} *)

open Netchannels

type store =
  [ `Memory
  | `File of string
      (* The string is the filename of a file containing the (decoded) value
       * of the body
       *)
  ]
  (** Specifies where to store the body of a mail message. [`Memory]
   * means in-memory, [`File name] means in the file [name]. The body
   * is stored in decoded form (i.e. without transfer encoding).
   *)

exception Immutable of string
  (** Raised if it is tried to modify a read-only value. The string denotes
   * the function or method where the incident happened.
   *)


(** MIME headers and bodies are defined in two steps. First the subtype
 * describing read access is defined ([mime_header_ro], and [mime_body_ro]),
 * and after that the full class type including write access is defined
 * ([mime_header], and [mime_body]).
 *
 * The idea is that you can write functions that take an ro value as
 * input to indicate that they do not modify the value. For example:
 * 
 * {[
 * let number_of_fields (h:#mime_header_ro) =
 *   List.length (h#fields) ]}
 *
 * This function accepts both [mime_header], and [mime_header_ro] values as
 * input, but the typing ensures that the function cannot mutate anything.
 *
 * There is another way to ensure that a header or body is not modified.
 * The read-only flag can be set when creating the object, and this flag
 * causes that all trials to modify the value will raise the exception
 * [Immutable]. Of course, such trials of mutation are only detected at
 * run-time.
 *
 * The advantage of the read-only flag is that it even works if 
 * mutation depends on a condition, but it can be ensured that this
 * condition is never true. Furthermore, typing is much simpler (getting
 * subtyping correct can be annoying).
 *)


(** This is the read-only version of a MIME header. There are only methods
 * to read the header fields.
 *)
class type mime_header_ro =             
object
  (* read-only view of a mime_header *)

  method fields : (string * string) list
  method field  : string -> string
  method multiple_field : string -> string list
    (** The current fields of the header. [fields] returns the complete
     * header. [field name] returns the value of the field, or raises
     * [Not_found]. [multiple_field name] returns all fields with the same
     * name.
     *
     * Note that field names are case-insensitive; [field "content-length"],
     * and [field "CONTENT-LENGTH"] will return the same field. However,
     * the method [fields] returns the original field names, without
     * adjustment of the case.
     *
     * The order of the fields is preserved.
     *)

  (* --------------------- Standard fields ----------------------- *)

  (** Access methods for frequent standard fields.
   *
   * These methods will raise [Not_found] if the fields are not
   * present.
   *)

  method content_length : unit -> int
    (** Returns the Content-length field as integer *)

  method content_type : 
           unit -> (string * (string * Mimestring.s_param)list)
    (** Returns the Content-type as parsed value. The left value of the
     * pair is the main type, and the right value is the list of 
     * parameters. For example, for the field value
     * ["text/plain; charset=utf-8"] this method returns
     * [("text/plain", ["charset", p])] where [p] is an opaque value
     * with [Mimestring.param_value p = "utf-8"]. 
     *)

  method content_disposition : 
           unit -> (string * (string * Mimestring.s_param)list)
    (** Returns the Content-disposition field as parsed value. The
     * left value is the main disposition, and the right value is the
     * list of parameters. For example, for the field value
     * ["attachment; filename=xy.dat"] this method returns
     * [("attachment", ["filename", p])] where [p] is an opaque value
     * with [Mimestring.param_value p = "xy.dat"].
     *)

  method content_transfer_encoding : unit -> string
    (** Returns the Content-transfer-encoding as string *)
end


(** A MIME header with both read and write method. It is still possible,
 * however, to set the read-only flag to make this kind of header
 * immutable, too.
 *)
class type mime_header = 
object
  (* A mutable or immutable mime_header *)

  inherit mime_header_ro
    (** Supports all these read access method, too *)

  method ro : bool
    (** whether the header is read-only or not *)

  method set_fields : (string * string) list -> unit
  method update_field : string -> string -> unit
  method update_multiple_field : string -> string list -> unit
  method delete_field : string -> unit
    (** These methods modify the fields of the header. If the header is
     * read-only, the exception [Immutable] will be raised.
     *
     * [set_fields] replaces the current fields with a new list of
     * (name,value) pairs. [update_field name value] replaces all fields
     * of the passed name with the single setting (name,value), or
     * adds this setting to the list. [update_multiple_field name values]
     * replaces all fields of the passed name with the list of values,
     * or adds this list. Finally, [delete_field name] deletes all
     * fields of the passed name. Nothing happens if there is no such
     * field.
     *
     * Both [update_field] and [update_multiple_field] first replace
     * existing values by the new ones without changing the order
     * of the fields in the header. Additional values are inserted
     * after the last existing value, or at the end of the header.
     *)

end


(** This is the read-only version of a MIME body. There are only methods
 * to read the body contents.
 *
 * The value of the body can be returned either as [string], or as
 * object channel. Both ways are possible independently of where
 * the value is stored, in-memory, or as external file.
 *)
class type mime_body_ro =
object
  (* a read-only view of a mime_body *)

  method value : string
    (** The [value] method returns the _decoded_ body,
     * i.e. transfer encodings are removed before the value is passed
     * back.
     *
     * When the body is stored in an external file, this method
     * reads the complete file into memory.
     *)

  method store : store
    (** Where the body is actually stored. *)

  method open_value_rd : unit -> in_obj_channel
    (** Opens the value for reading. This works independently of where
     * the body is stored. For example, to read the body line by line:
     * {[
     * let ch = body # open_value_rd () in
     * try
     *   while true do
     *     let line = ch # input_line() in
     *     ... (* do something *)
     *   done;
     *   assert false; (* never reached *)
     * with
     *   End_of_file ->
     *     ch # close_in()
     * ]}
     *
     * As [value], this method returns the value in decoded form.
     * This method is quite economical with the resources, and takes
     * only as much memory as needed for the channel operations.
     *)

  method finalize : unit -> unit
    (** After the body has been finalized, it cannot be accessed any longer.
     * External resources (files) are deallocated, if they are seen as
     * temporary.
     *)
end


(** A MIME body with both read and write method. It is still possible,
 * however, to set the read-only flag to make this kind of body
 * immutable, too.
 *
 * The value of the body can be set either by a [string], or by writing
 * to an object channel. Both ways are possible independently of where
 * the value is stored, in-memory, or as external file.
 *)
class type mime_body =
object
  (* A mutable or immutable mime_body *)

  inherit mime_body_ro
    (** Supports all these read access method, too *)

  method ro : bool
    (** whether this body is read-only or not *)

  method set_value : string -> unit
    (** Sets the value. If the value is immutable, the exception
     * [Immutable] will be raised.
     *
     * The passed string must be in decoded form. When the body is
     * stored in an external file, the file is overwritten.
     *)

  method open_value_wr : unit -> out_obj_channel
    (** Opens the value for writing. The current value is overwritten. 
     * If the value is immutable, the exception [Immutable] will be raised.
     *
     * For example, to copy the file [f] into the value:
     * {[ 
     * let ch = body # open_value_wr() in
     * let f_ch = new Netchannels.input_file f in
     * ch # output_channel f_ch;
     * f_ch # close_in();
     * ch # close_out();
     * ]}
     * 
     *)

end


(** One can consider the pair [(mime_header, mime_body)] as simple MIME
 * message with one header and one body. Of course, this simple representation
 * does not support multi-part messages (attachments). For that reason,
 * the [complex_mime_message] was invented: The body can be further
 * structured as a sequence of parts that are complex messages themselves.
 *
 * For example, a mail message with an attachment is usually
 * represented as
 * {[
 * (mail_header, `Parts [ (main_header, `Body main_body);
 *                        (att_header, `Body att_body) ] ) ]}
 *
 * Here, [mail_header] is the real header of the mail message.
 * [main_header] is the header of the main message, usually
 * only containing the content type of [main_body], the body
 * of the main message. The attachment has also its own 
 * [att_header], again usually only containing the content type,
 * and the data of the attachment can be found in [att_body].
 *
 * Nowadays, mails have often even a more complicated structure
 * with [`Parts] containing nested [`Parts]. As [complex_mime_message]
 * is recursive, any kind of nesting can be easily represented.
 *)

type complex_mime_message = mime_header * complex_mime_body
and complex_mime_body =
  [ `Body of mime_body
  | `Parts of complex_mime_message list
  ]
  (* A complex_mime_message can have (nested) multipart structure. *)

type complex_mime_message_ro = mime_header_ro * complex_mime_body_ro
and complex_mime_body_ro =
  [ `Body of mime_body_ro
  | `Parts of complex_mime_message_ro list
  ]
  (** The read-only view of a complex_mime_message *)


(** Note: [`Parts []], i.e. [`Parts] together with an empty list, is 
 * considered as illegal. Such a value cannot be transformed into
 * printable text.
 *)


type mime_message = mime_header * [ `Body of mime_body ]
  (** Simple MIME message, in a form that is compatible with complex
   * ones.
   *)

type mime_message_ro = mime_header_ro * [ `Body of mime_body_ro ]
  (** Read-only variant of simple messages *)


(* ************************* Representations ************************** *)

(** {1:classes Classes} *)

class basic_mime_header : ?ro:bool -> (string * string) list -> mime_header
  (** An implementation of [mime_header].
   *
   * The argument is the list of (name,value) pairs of the header. 
   *
   * Example: Create a MIME header with only the field "Content-type":
   * {[ let h = new basic_mime_header ["content-type", "text/plain"] ]}
   *
   * Example: Set the field "Subject":
   * {[ h # update_field "subject" "The value of this field" ]}
   *
   * This [mime_header] implementation bases on a mixture of a [Map] data
   * structure and a doubly linked list. The efficiency of the operations
   * (n=number of fields; m=average number of values per field; 
   * n*m=total number of values):
   * - [new], [set_fields]: O(m * n * log n), but the construction of the dictionary
   *   is deferred until the first real access
   * - [field]: O(log n)
   * - [multiple_field]: O(log n + m)
   * - [fields]: O(n * m)
   * - [update_field], [update_multiple_field]: O(log n + m)
   * - [delete_field]: O(n + m)
   *
   * @param ro whether the header is read-only (default: false)
   *)

class memory_mime_body : ?ro:bool -> string -> mime_body
  (** An implementation of [mime_body] where the value is stored
   * in-memory.
   *
   * The argument is the initial (decoded) value of the body.
   * The method [store] returns [`Memory].
   *
   * Example: To create a body from a string, call
   * {[ new memory_mime_body "The value as string" ]}
   *
   * @param ro whether the body is read-only (default: false)
   *)


class file_mime_body : ?ro:bool -> ?fin:bool -> string -> mime_body
  (** An implementation of [mime_body] where the value is stored
   * in an external file.
   *
   * The argument is the name of the file containing the (decoded) value. 
   * The method [store] returns [`File filename].
   * The method [value] loads the contents of the file and returns them
   * as string.
   *
   * Example: To create a body from the file "f", call
   * {[ new file_mime_body "f" ]}
   *
   * @param ro whether the body is read-only (default: false)
   * @param fin whether to delete the file when the [finalize] method is called
   *   (default: false)
   *)

(* ******************************************************************** *)

(** {1:parsing Parsing MIME messages} *)

val read_mime_header :
      ?unfold:bool ->                        (* default: false *)
      ?strip:bool ->                         (* default: true *)
      ?ro:bool ->                            (* default: false *)
      Netstream.in_obj_stream -> 
	mime_header
  (** Decodes the MIME header that begins at the current position of the
   * netstream, and returns the header as class [basic_mime_header].
   * After returning, the stream is advanced to the byte following the 
   * empty line terminating the header.
   *
   * Example: To read the header at the beginning of the file "f", use:
   * {[ 
   * let ch = new Netchannels.input_channel (open_in "f") in
   * let stream = new Netstream.input_stream ch in
   * let h = read_mime_header stream in
   * ...
   * stream#close_in();    (* no need to close ch *)
   * ]}
   *
   * Note that although the [stream] position after parsing is exactly 
   * known, the position of [ch] cannot be predicted.
   *
   * @param unfold whether linefeeds are replaced by spaces in the values of the
   *   header fields (Note: defaults to [false] here in contrast to
   *   [Mimestring.scan_header]!)
   * @param strip whether whitespace at the beginning and at the end of the 
   *   header fields is stripped
   * @param ro whether the returned header is read-only (default: false)
   *)

(** Hint: To write the header [h] into the channel [ch], use
 * {[ Mimestring.write_header ch h#fields ]}
 *
 * Link: {!Mimestring.write_header}
 *)

type multipart_style = [ `None | `Flat | `Deep ]
  (** How to parse multipart messages:
   * - [`None]: Do not handle multipart messages specially. Multipart bodies
   *    are not further decoded, and returned as [`Body b] where [b] is
   *    the transfer-encoded text representation.
   * - [`Flat]: If the top-level message is a multipart message, the parts
   *    are separated and returned as list. If the parts are again multipart
   *    messages, these inner multipart messages are not furher decoded 
   *    and returned as [`Body b].
   * - [`Deep]: Multipart messages are recursively decoded and returned as
   *    tree structure.
   *
   * This value determines how far the [complex_mime_message] structure
   * is created for a parsed MIME message. [`None] means that no parts
   * are decoded, and messages have always only a simple [`Body b],
   * even if [b] is in reality a multi-part body. With [`Flat], the
   * top-level multi-part bodies are decoded (if found), and messages
   * can have a structured [`Parts [_, `Body b1; _, `Body b1; ...]]
   * body. Finally, [`Deep] allows that inner multi-part bodies are
   * recursively decoded, and messages can have an arbitrarily complex
   * form.
   *)

val decode_mime_body : #mime_header_ro -> out_obj_channel -> out_obj_channel
  (** [let ch' = decode_mime_body hdr ch]:
   * According to the value of the Content-transfer-encoding header field
   * in [hdr] the encoded MIME body written to [ch'] is decoded and transferred
   * to [ch].
   * 
   * Handles 7bit, 8bit, binary, quoted-printable, base64.
   *
   * Example: The file "f" contains base64-encoded data, and is to be decoded 
   * and to be stored in "g":
   *
   * {[ 
   * let ch_f = new Netchannels.input_channel (open_in "f") in
   * let ch_g = new Netchannels.output_channel (open_out "g") in
   * let hdr = new basic_mime_header ["content-transfer-encoding", "base64" ] in
   * let ch = decode_mime_body hdr ch_g in
   * ch # output_channel ch_f;
   * ch # close_out();
   * ch_g # close_out();
   * ch_f # close_in();
   * ]}
   *
   * Note: This function is internally used by [read_mime_message] to
   * decode bodies. There is usually no need to call it directly.
   *)


val storage : ?ro:bool -> ?fin:bool -> store -> (mime_body * out_obj_channel)
  (** Creates a new storage facility for a mime body according to [store].
   * This function can be used to build the [storage_style] argument 
   * of the class [read_mime_message] (below). For example, this is
   * useful to store large attachments in external files, as in:
   *
   * {[ 
   * let storage_style hdr = 
   *   let filename = hdr ... (* extract from hdr *) in
   *   storage (`File filename)
   * ]}
   *
   * @param ro whether the returned mime_bodies are read-only or not. Note that
   *   it is always possible to write into the body using the returned
   *   out_obj_channel regardless of the value of ~ro.
   *   Default: false
   * @param fin whether to finalize bodies stored in files.
   *   Default: false
   *)

val read_mime_message : 
      ?unfold:bool ->                                     (* Default: false *)
      ?strip:bool ->                                      (* default: true *)
      ?ro:bool ->                                         (* Default: false *)
      ?multipart_style:multipart_style ->                 (* Default: `Deep *)
      ?storage_style:(mime_header -> (mime_body * out_obj_channel)) ->
      Netstream.in_obj_stream -> 
        complex_mime_message
  (** Decodes the MIME message that begins at the current position of the
   * passed netstream. It is expected that the message continues until
   * EOF of the netstream.
   *
   * Multipart messages are decoded as specified by [multipart_style] (see
   * above).
   *
   * Message bodies with content-transfer-encodings of 7bit, 8bit, binary,
   * base64, and quoted-printable can be processed. The bodies are stored
   * without content-transfer-encoding (i.e. in decoded form), but the
   * content-transfer-encoding header field is not removed from the header.
   *
   * The [storage_style] function determines where every message body is
   * stored. The corresponding header of the body is passed to the function
   * as argument; the result of the function is a pair of a new [mime_body]
   * and an [out_obj_channel] writing into this body. You can create such a
   * pair by calling [storage] (above).
   *
   * By default, the [storage_style] is [storage ?ro `Memory] for every header. 
   * Here, the designator [`Memory] means that the body will be stored in an
   * O'Caml string. The designator [`File fn] would mean that the body will be stored in the
   * file [fn]. The file would be created if it did not yet exist, and
   * it would be overwritten if it did already exist.
   *
   * Note that the [storage_style] function is called for every non-multipart
   * body part.
   *
   * Large message bodies (> maximum string length) are supported if the
   * bodies are stored in files. The memory consumption is optimized for
   * this case, and usually only a small constant amount of memory is needed.
   *
   * Example:
   *
   * Parse the MIME message stored in the file f:
   *
   * {[
   * let m = read_mime_message 
   *           (new input_stream (new input_channel (open_in f)))
   * ]}
   *
   * @param unfold whether linefeeds are replaced by spaces in the values of the
   *   header fields (Note: defaults to [false] here in contrast to
   *   {!Mimestring.scan_header}!)
   * @param strip whether whitespace at the beginning and at the end of the 
   *   header fields is stripped
   * @param ro Whether the created MIME headers are read-only or not. Furthermore,
   *   the default [storage_style] uses this parameter for the MIME bodies, too.
   *   However, the MIME bodies may have a different read-only flag in general.
   *
   *)

  (* TODO: what about messages with type "message/*"? It may be possible that
   * they can be recursively decoded, but it is also legal for some media
   * types that they are "partial".
   * Currently the type "message/*" is NOT decoded.
   *)

(** {1:printing Printing MIME Messages} *)

val encode_mime_body : ?crlf:bool -> #mime_header_ro -> out_obj_channel -> out_obj_channel
  (** [let ch' = encode_mime_body hdr ch]:
   * According to the value of the Content-transfer-encoding header field
   * in [hdr] the unencoded MIME body written to ch' is encoded and transferred
   * to ch.
   *
   * Handles 7bit, 8bit, binary, quoted-printable, base64.
   *
   * For an example, see [decode_mime_body] which works in a similar way
   * but performs decoding instead of encoding.
   *
   * @param crlf if set (this is by default the case) CR/LF will be used for
   *   end-of-line (eol) termination, if not set LF will be used. For 7bit, 8bit and
   *   binary encoding the existing eol delimiters are not rewritten, so this option
   *   has only an effect for quoted-printable and base64.
   *)


val write_mime_message :
      ?wr_header:bool ->                       (* default: true *)
      ?wr_body:bool ->                         (* default: true *)
      ?nr:int ->                               (* default: 0 *)
      ?ret_boundary:string ref ->              (* default: do not return it *)
      ?crlf:bool ->                            (* default: true *)
      Netchannels.out_obj_channel ->
      complex_mime_message ->
        unit
  (** Writes the MIME message to the output channel. The content-transfer-
   * encoding of the leaves is respected, and their bodies are encoded
   * accordingly. The content-transfer-encoding of multipart messages is
   * always "fixed", i.e. set to "7bit", "8bit", or "binary" depending
   * on the contents.
   *
   * The function fails if multipart messages do not have a multipart
   * content type field (i.e. the content type does not begin with "multipart").
   * If only the boundary parameter is missing, a good boundary parameter is
   * added to the content type. "Good" means here that it is impossible
   * that the boundary string occurs in the message body if the
   * content-transfer-encoding is quoted-printable or base64, and that
   * such an occurrence is very unlikely if the body is not encoded.
   * If the whole content type field is missing, a "multipart/mixed" type
   * with a boundary parameter is added to the printed header.
   *
   * Note that already existing boundaries are used, no matter whether
   * they are of good quality or not.
   *
   * No other header fields are added, deleted or modified. The mentioned
   * modifications are _not_ written back to the passed MIME message but
   * only added to the generated message text.
   *
   * It is possible in some cases that the boundary does not work (both
   * the existing boundary, and the added boundary). This causes that a wrong
   * and unparseable MIME message is written. In order to ensure a correct
   * MIME message, it is recommended to parse the written text, and to compare
   * the structure of the message trees. It is, however, very unlikely that
   * a problem arises.
   *
   * Note that if the passed message is a simple message like (_,`Body _),
   * and if no content-transfer-encoding is set, the written message might
   * not end with a linefeed character.
   *
   * @param wr_header If true, the outermost header is written. Inner headers
   *   of the message parts are written unless ~wr_body=false.
   * @param wr_body If true, the body of the whole message is written; if false,
   *   no body is written at all.
   * @param nr This argument sets the counter that is included in generated
   *   boundaries to a certain minimum value.
   * @param ret_boundary if passed, the boundary of the outermost multipart
   *   message is written to this reference. (Internally used.)
   * @param crlf if set (this is by default the case) CR/LF will be used for
   *   end-of-line (eol) termination, if not set LF will be used. The eol 
   *   separator is used for the header, the multipart framing, and for
   *   bodies encoded as quoted-printable or base64. Other eol separators are
   *   left untouched.
   *)

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml