(* Plasma GitLab Archive
 * Projects Blog Knowledge
 *)

(* $Id: nethtml.mli 1296 2009-11-18 13:27:41Z ChriS $
 * ----------------------------------------------------------------------
 *
 *)

(** Parsing of HTML *)


(** Nodes of a parsed HTML document. A [document] value is one of:
 *
 * {ul
 * {- [Element (name, args, subnodes)]: an element node for an element of
 *   type [name] (i.e. written [<name ...>...</name>]), carrying the
 *   arguments [args] and the subnodes [subnodes] (the material within the
 *   element). The arguments are plain name/value pairs. Entity references
 *   (something like [&xy;]) occurring in the values are {b not} resolved.
 *
 *   An argument without a value (e.g. [multiple] in
 *   [<select name="x" multiple>]) is represented as [(name,name)], i.e.
 *   the name doubles as the value.
 *
 *   As argument names are case-insensitive, the names are all lowercase.}
 * {- [Data s]: a character data node. Again, entity references are kept
 *   as written and are not replaced by what they mean.}
 * }
 *
 * Character encodings: The parser is restricted to ASCII-compatible
 * encodings (see the function {!Netconversion.is_ascii_compatible} for
 * a definition). To read any other encoding, the text must first be
 * recoded to an ASCII-compatible encoding (example below).
 * In addition, names of elements and attributes must be ASCII-only.
 *)
type document =
  | Element of (string * (string * string) list * document list)
  | Data of string


(** We also need a type that declares how to handle the various tags.
 * This is called a "simplified DTD", as it is derived from SGML DTDs,
 * but simplified to the extent used in the HTML definition.
 *)

(* Now follows the type definition of simplified DTDs. *)

type element_class =
  [ `Inline | `Block | `Essential_block | `None | `Everywhere ]
(** The class of an element. Element classes are a property used in the
 * HTML DTD. For our purposes, an element class is simply one value of
 * this enumeration:
 * - [`Inline]: the class of inline HTML elements
 * - [`Block]: the class of block HTML elements
 * - [`Essential_block]: a sub-class of [`Block] with the additional
 *   property that every start tag must be explicitly ended
 * - [`None]: the members of the class are neither block nor inline
 *   elements, but have to be handled specially
 * - [`Everywhere]: the members of the class can occur everywhere,
 *   regardless of whether a constraint allows it or not.
 *)


type model_constraint =
  [ `Inline
  | `Block
  | `Flow
  | `Empty
  | `Any
  | `Special
  | `Elements of string list
  | `Or of model_constraint * model_constraint
  | `Except of model_constraint * model_constraint
  | `Sub_exclusions of string list * model_constraint ]
(** The constraint the sub elements of an element must fulfill:
 * - [`Inline]: The sub elements must belong to the class [`Inline]
 * - [`Block]: The sub elements must be members of the classes [`Block] or
 *   [`Essential_block]
 * - [`Flow]: The sub elements must belong to the classes [`Inline], [`Block],
 *   or [`Essential_block] (i.e. [`Inline] or [`Block])
 * - [`Empty]: There are no sub elements
 * - [`Any]: Any sub element is allowed
 * - [`Special]: The element has special content (e.g. [<script>]).
 *   Functionally equivalent to [`Empty]
 * - [`Elements l]: Only the elements enumerated in [l] may occur as sub
 *   elements
 * - [`Or(m1,m2)]: One of the constraints [m1] or [m2] must hold
 * - [`Except(m1,m2)]: The constraint [m1] must hold, and [m2] must not hold
 * - [`Sub_exclusions(l,m)]: The constraint [m] must hold; furthermore,
 *   the elements enumerated in list [l] are not allowed as direct or
 *   indirect subelements, even if [m] or the model of a subelement would
 *   allow them. The difference to [`Except(m, `Elements l)] is that the
 *   exclusion is inherited by the subelements. The [`Sub_exclusions]
 *   expression must be toplevel, i.e. it must not occur within an [`Or],
 *   [`Except], or another [`Sub_exclusions] expression.
 *
 * Note that the members of the class [`Everywhere] are allowed everywhere,
 * regardless of whether the model constraint allows them or not.
 *
 * Note that certain aspects are not modeled:
 * - [#PCDATA]: We do not specify where PCDATA is allowed and where not.
 * - Order, Number: We specify neither the order in which the sub elements
 *   must occur nor how often they can occur
 * - Inclusions: DTDs may describe that an element extraordinarily
 *   allows a list of elements in all sub elements.
 * - Optional tags: Whether start or end tags can be omitted (to some extent,
 *   this can be expressed with [`Essential_block], however)
 *)

type simplified_dtd = (string * (element_class * model_constraint)) list
(** A [simplified_dtd] is an association list whose entries have the form
 *  [(element_name, (element_class, constraint))]: each entry declares
 *  that [element_name] is a member of [element_class], and that the
 *  sub elements of [element_name] must satisfy [constraint].
 *
 *  It is not allowed to have several entries for the same element.
 *)

val html40_dtd : simplified_dtd
  (** The (transitional) HTML 4.0 DTD, expressed as a [simplified_dtd].
   * This is the default DTD of [parse_document], [parse], [decode],
   * [encode], and [write].
   *)

val relaxed_html40_dtd : simplified_dtd
  (** A relaxed version of the HTML 4.0 DTD that better matches common
   * practice. In particular, this DTD additionally allows inline
   * elements to span blocks. For example,
   * {[ <B>text1 <P>text2 ]}
   * is parsed as
   * {[ <B>text1 <P>text2</P></B> ]}
   * and not as
   * {[ <B>text1 </B><P>text2</P> ]}
   * \- the latter is more correct (and is what [html40_dtd] produces), but
   * it is not what users expect.
   *
   * Note that this is still not what many browsers implement. For example,
   * Netscape treats most inline tags specially: [<B>] switches bold on,
   * [</B>] switches bold off. For example,
   * {[ <A href='a'>text1<B>text2<A href='b'>text3 ]}
   * is parsed as
   * {[ <A href='a'>text1<B>text2</B></A><B><A href='b'>text3</A></B> ]}
   * \- there is an extra [B] element around the second anchor! (You can
   * see what Netscape parses by loading a page into the "Composer".)
   * IMHO it is questionable to consider inline tags as switches because
   * this is totally outside of the HTML specification, and browsers may
   * differ in that point.
   *
   * Furthermore, several elements are turned into essential blocks:
   * [TABLE], [UL], [OL], and [DL]. David Fox reported a problem with
   * structures like:
   * {[ <TABLE><TR><TD><TABLE><TR><TD>x</TD></TD></TR></TABLE>y</TD></TR></TABLE> ]}
   * i.e. the [TD] of the inner table has two end tags. Without additional
   * help, the second [</TD>] would close the outer table cell. Because of
   * this problem, tables are now essential, meaning that it is not allowed
   * to implicitly add a missing [</TABLE>]; every table element has to
   * be explicitly ended. This rule seems to be what many browsers implement.
   *)

val parse_document : ?dtd:simplified_dtd ->            (* default: html40_dtd *)
                     ?return_declarations:bool ->      (* default: false *)
                     ?return_pis:bool ->               (* default: false *)
                     ?return_comments:bool ->          (* default: false *)
                     Lexing.lexbuf ->
                       document list
  (** Parses the HTML document from a [lexbuf] and returns it as a list
   * of [document] nodes.
   *
   * @param dtd specifies the DTD to use. By default, [html40_dtd] is used,
   *   which is based on the transitional HTML 4.0 DTD
   * @param return_declarations if set, the parser returns [<!...>] declarations
   *   as [Element("!",["contents",c],[])] nodes, where [c] is the string inside
   *   [<!] and [>]. - By default ([false]), declarations are skipped.
   * @param return_pis if set, the parser returns [<?...>] (or [<?...?>]) processing
   *   instructions as [Element("?",["contents",c],[])] nodes, where [c] is the
   *   string inside [<?] and [>] (or [?>]). - By default ([false]), processing
   *   instructions are skipped.
   * @param return_comments if set, the parser returns [<!--] .... [-->] comments
   *   as [Element("--",["contents",c],[])] nodes, where [c] is the string inside
   *   [<!--] and [-->]. - By default ([false]), comments are skipped.
   *)

val parse : ?dtd:simplified_dtd ->            (* default: html40_dtd *)
            ?return_declarations:bool ->      (* default: false *)
            ?return_pis:bool ->               (* default: false *)
            ?return_comments:bool ->          (* default: false *)
              Netchannels.in_obj_channel ->
                document list
  (** Parses the HTML document from an object channel and returns it as a
   * list of [document] nodes.
   * For example, to parse the HTML string [s]:
   * {[
   * let ch = new Netchannels.input_string s in
   * let doc = parse ch
   * ]}
   *
   * The optional arguments have the same meaning and the same defaults
   * as in [parse_document].
   *)

(** {b Note on XHTML}
 *
 * The parser can read XHTML, as long as the following XML features are not
 * used:
 * - Internal DTD subset, i.e. [<!DOCTYPE html ... [ ... ]>]
 * - External entities
 * - [<!\[CDATA\[]
 * - [<!\[INCLUDE\[]
 * - [<!\[IGNORE\[]
 *
 * The following XML features are ok:
 * - Processing instructions
 * - Empty elements (e.g. [<br/>]) as long as the element is declared as 
 *   [`Empty].
 *)

(** {b Note on Character Encodings}
 *
 * The parser can only read character streams that are encoded in an ASCII-
 * compatible way. For example, it is possible to read a UTF-8-encoded
 * stream, but not a UTF-16-encoded stream. All bytes between 1 and 127
 * are taken as ASCII, and other bytes are ignored (copied from input
 * to output).
 *
 * Non-ASCII-compatible streams must be recoded first. For example, to
 * read a UTF-16-encoded netchannel [ch], use:
 *
 * {[
 * let p = 
 *   new Netconversion.recoding_pipe ~in_enc:`Enc_utf16 ~out_enc:`Enc_utf8 () in
 * let ch' =
 *   new Netchannels.input_filter ch p in
 * let doc =
 *   Nethtml.parse ch' in
 * ch' # close_in();
 * ch # close_in();
 * ]}
 *)


val decode : 
      ?enc:Netconversion.encoding ->           (* default: `Enc_iso88591 *)
      ?subst:(int -> string) ->                (* default: failure *)
      ?entity_base:[ `Html | `Xml | `Empty ] -> 
      ?lookup:(string -> string) ->
      ?dtd:simplified_dtd ->
      document list -> document list
  (** Converts entities [&name;] and [&#num;] into the corresponding
   * characters and returns the converted document list.
   *
   * The argument [enc] must indicate the character set of
   * the document (by default ISO-8859-1 for backwards compatibility).
   * If a character cannot be represented in this encoding, the function
   * [subst] is called (input is the Unicode code point, output is the
   * substituted string). By default, the function fails if such a
   * character is found.
   *
   * The argument [entity_base] selects which entities can be converted
   * (see {!Netencoding.Html.decode}). The function [lookup] is called
   * for all unknown [&name;] entities. By default, this function fails.
   *
   * Declarations, processing instructions, and comments are not
   * decoded. The same also applies to elements declared as [`Special]
   * in the DTD. The [dtd] argument determines the DTD; by default,
   * [html40_dtd] is assumed.
   *)

val encode : 
      ?enc:Netconversion.encoding ->           (* default: `Enc_iso88591 *)
      ?prefer_name:bool ->                     (* default: true *)
      ?dtd:simplified_dtd ->
      document list -> document list
  (** Converts problematic characters to their corresponding
   * entities and returns the converted document list.
   *
   * The argument [enc] must indicate the character set of
   * the document (by default ISO-8859-1 for backwards compatibility).
   * If [prefer_name] is set, the algorithm tries to find the named
   * entities ([&name;]); otherwise only numeric entities ([&#num;]) are
   * generated. Names are preferred by default.
   *
   * Declarations, processing instructions, and comments are not
   * encoded. The same also applies to elements declared as [`Special]
   * in the DTD. The [dtd] argument determines the DTD; by default,
   * [html40_dtd] is assumed.
   *)

val map_list : (string -> string) -> document list -> document list
  (** [map_list f doclst]:
   * Applies [f] to all attribute values and data strings (except
   * the attributes of "?", "!", or "--" nodes) and returns the
   * resulting document list.
   *
   * This can be used to transform the text of a parsed document
   * (for instance, to change its text encoding). For example,
   * {[
   * let doc' = map_list String.lowercase doc
   * ]}
   * converts all text data to lowercase characters.
   * ([String.lowercase] is deprecated since OCaml 4.03 and absent from
   * OCaml 5; use [String.lowercase_ascii] there.)
   *)

(** The argument passed to the callback of [xmap_list]:
 * - [Xmap_attribute(ename,aname,aval)]: the attribute [aname] of the
 *   element [ename], with value [aval]
 * - [Xmap_data(ename_opt,data)]: a data node with contents [data],
 *   surrounded by the element [ename_opt] ([None] if the data node is
 *   the outermost node)
 *)
type xmap_value =
    Xmap_attribute of string * string * string
  | Xmap_data of string option * string

val xmap_list : (xmap_value -> string) -> string option ->
                   document list -> document list
  (** [xmap_list f surrounding_element_opt doclst]: Similar to [map_list],
    * the function [f] is applied to all attribute values and data strings,
    * and the resulting document list is returned.
    * Unlike [map_list], more information is passed to the callback function
    * [f]. This function is called with an [xmap_value] argument:
    * - [Xmap_attribute(ename,aname,aval)]: The function is called for an
    *   attribute value of element [ename]. The attribute is [aname] and
    *   has the value [aval]. The function must return the new value of
    *   the attribute (i.e. [aval']).
    * - [Xmap_data(ename_opt,data)]: The function is called for a data
    *   node surrounded by an element [ename_opt] (which is [None] if the
    *   data node is the outermost node). The string [data] is the value
    *   of the data node. The function must return the new value of the
    *   data node (i.e. [data']).
    *
    * [xmap_list] is invoked with [surrounding_element_opt], which is the
    * name of the element surrounding [doclst], or [None] if such an
    * element does not exist or is unknown.
    *)


val write : ?dtd:simplified_dtd ->            (* default: html40_dtd *) 
            ?xhtml:bool ->
            Netchannels.out_obj_channel ->
            document list ->
	      unit
  (** Writes the document to the output channel. No additional encoding or
   * decoding happens.
   *
   * Empty elements are written without end tag (see also the optional
   * argument [xhtml]); the rest is written unabbreviated.
   *
   * Example: To write the document [doc] to a file:
   * {[
   * let f = open_out "filename" in
   * let ch = new Netchannels.output_channel f in
   * write ch doc;
   * ch # close_out()
   * ]}
   *
   * @param dtd The assumed simplified DTD, by default [html40_dtd]
   * @param xhtml makes the output compatible with XHTML 1.0 Strict by
   * closing [`Empty] tags with "/>" ([true] by default).
   *)

(* This web site is published by Informatikbüro Gerd Stolpmann
 * Powered by Caml
 *)