(* $Id: nethtml.mli 1296 2009-11-18 13:27:41Z ChriS $
* ----------------------------------------------------------------------
*
*)
(** Parsing of HTML *)
(** The type [document] represents parsed HTML documents:
*
* {ul
* {- [Element (name, args, subnodes)] is an element node for an element of
* type [name] (i.e. written [<name ...>...</name>]) with arguments [args]
* and subnodes [subnodes] (the material within the element). The arguments
* are simply name/value pairs. Entity references (something like [&xy;])
* occurring in the values are {b not} resolved (see {!decode} for
* resolving them afterwards).
*
* Arguments without values (e.g. [<select name="x" multiple>]: here,
* [multiple] is such an argument) are represented as [(name,name)], i.e. the
* name is also returned as value.
*
* As argument names are case-insensitive, the names are all lowercase.}
* {- [Data s] is a character data node. Again, entity references are kept
* literally (e.g. as [&xy;]) and are not replaced by the characters they
* stand for.}
* }
*
* Character encodings: The parser is restricted to ASCII-compatible
* encodings (see the function {!Netconversion.is_ascii_compatible} for
* a definition). In order to read other encodings, the text must be
* first recoded to an ASCII-compatible encoding (example below).
* Names of elements and attributes must additionally be ASCII-only.
*)
type document =
Element of (string * (string*string) list * document list)
(* (element name, (attribute name, attribute value) pairs, subnodes) *)
| Data of string
(* character data; entity references are left unresolved *)
;;
(** We also need a type that declares how to handle the various tags.
* This is called a "simplified DTD", as it is derived from SGML DTDs,
* but simplified to the extent used in the HTML definition.
*)
(* Now follows the type definition of simplified DTDs. *)
type element_class = (* What is the class of an element? *)
[ `Inline
| `Block
| `Essential_block
| `None
| `Everywhere
]
;;
(** Element classes are a property used in the HTML DTD. For our purposes,
* we define element classes simply as an enumeration:
* - [`Inline] is the class of inline HTML elements
* - [`Block] is the class of block HTML elements
* - [`Essential_block] is a sub-class of [`Block] with the additional
* property that every start tag must be explicitly ended, i.e. a missing
* end tag is never added implicitly (see {!relaxed_html40_dtd} for the
* motivation)
* - [`None] means that the members of the class are neither block nor
* inline elements, but have to be handled specially
* - [`Everywhere] means that the members of the class can occur everywhere,
* regardless of whether a constraint allows it or not.
*)
type model_constraint = (* The constraint the subelements must fulfill *)
[ `Inline
| `Block
| `Flow (* = `Inline or `Block *)
| `Empty
| `Any
| `Special
| `Elements of string list (* Enumeration of allowed elements *)
| `Or of (model_constraint * model_constraint)
| `Except of (model_constraint * model_constraint)
| `Sub_exclusions of (string list * model_constraint)
]
;;
(** Model constraints define the possible sub elements of an element:
* - [`Inline]: The sub elements must belong to the class [`Inline]
* - [`Block]: The sub elements must be members of the classes [`Block] or
* [`Essential_block]
* - [`Flow]: The sub elements must belong to the classes [`Inline], [`Block],
* or [`Essential_block]
* - [`Empty]: There are no sub elements
* - [`Any]: Any sub element is allowed
* - [`Special]: The element has special content (e.g. [<script>]).
* Functionally equivalent to [`Empty]
* - [`Elements l]: Only these enumerated elements may occur as sub elements
* - [`Or(m1,m2)]: One of the constraints [m1] or [m2] must hold
* - [`Except(m1,m2)]: The constraint [m1] must hold, and [m2] must not hold
* - [`Sub_exclusions(l,m)]: The constraint [m] must hold; furthermore,
* the elements enumerated in list [l] are not allowed as direct or
* indirect subelements, even if [m] or the model of a subelement would
* allow them. The difference from [`Except(m, `Elements l)] is that the
* exclusion is inherited by the subelements. The [`Sub_exclusions]
* expression must be toplevel, i.e. it must not occur within an [`Or],
* [`Except], or another [`Sub_exclusions] expression.
*
* Note that the members of the class [`Everywhere] are allowed everywhere,
* regardless of whether the model constraint allows them or not.
*
* Note that certain aspects are not modeled:
* - [#PCDATA]: We do not specify where PCDATA is allowed and where not.
* - Order, Number: We specify neither the order in which the sub elements
* must occur nor how often they can occur
* - Inclusions: DTDs may describe that an element exceptionally
* allows a list of elements in all sub elements.
* - Optional tags: Whether start or end tags can be omitted (to some extent,
* this can be expressed with [`Essential_block], however)
*)
type simplified_dtd =
(string * (element_class * model_constraint)) list;;
(** A [simplified_dtd] is an association list of tuples
* [(element_name, (element_class, constraint))]: For every [element_name]
* it is declared that it is a member of [element_class], and that
* its sub elements must satisfy [constraint].
*
* It is not allowed to have several entries for the same element, i.e.
* every [element_name] may occur at most once in the list.
*)
val html40_dtd : simplified_dtd
(** The (transitional) HTML 4.0 DTD, expressed as [simplified_dtd].
* This is the DTD used by default by {!parse_document}, {!parse},
* {!decode}, {!encode}, and {!write}.
*)
val relaxed_html40_dtd : simplified_dtd
(** A relaxed version of the HTML 4.0 DTD that better matches common
* practice. In particular, this DTD additionally allows that inline
* elements may span blocks. For example,
* {[ <B>text1 <P>text2 ]}
* is parsed as
* {[ <B>text1 <P>text2</P></B> ]}
* and not as
* {[ <B>text1 </B><P>text2</P> ]}
* \- the latter is more correct (and parsed by [html40_dtd]), but is not what
* users expect.
*
* Note that this is still not what many browsers implement. For example,
* Netscape treats most inline tags specially: [<B>] switches bold on,
* [</B>] switches bold off. For example,
* {[ <A href='a'>text1<B>text2<A href='b'>text3 ]}
* is parsed as
* {[ <A href='a'>text1<B>text2</B></A><B><A href='b'>text3</A></B> ]}
* \- there is an extra [B] element around the second anchor! (You can
* see what Netscape parses by loading a page into the "Composer".)
* IMHO it is questionable to consider inline tags as switches because
* this is totally outside of the HTML specification, and browsers may
* differ in that point.
*
* Furthermore, several elements are turned into essential blocks:
* [TABLE], [UL], [OL], and [DL]. David Fox reported a problem with structures
* like:
* {[ <TABLE><TR><TD><TABLE><TR><TD>x</TD></TD></TR></TABLE>y</TD></TR></TABLE> ]}
* i.e. the [TD] of the inner table has two end tags. Without additional
* help, the second [</TD>] would close the outer table cell. Because of
* this problem, tables are now essential blocks, meaning that it is not
* allowed to implicitly add a missing [</TABLE>]; every table element has
* to be explicitly ended. This rule seems to be what many browsers
* implement.
*)
val parse_document : ?dtd:simplified_dtd -> (* default: html40_dtd *)
?return_declarations:bool -> (* default: false *)
?return_pis:bool -> (* default: false *)
?return_comments:bool -> (* default: false *)
Lexing.lexbuf ->
document list
(** Parses the HTML document from a [lexbuf] and returns it as a list of
* [document] nodes.
*
* @param dtd specifies the DTD to use. By default, {!html40_dtd} is used,
* which is based on the transitional HTML 4.0 DTD
* @param return_declarations if set, the parser returns [<!...>] declarations
* as [Element("!",["contents",c],[])] nodes, where [c] is the string inside
* [<!] and [>]. - By default, declarations are skipped.
* @param return_pis if set, the parser returns [<?...>] (or [<?...?>]) processing
* instructions as [Element("?",["contents",c],[])] nodes, where [c] is the
* string inside [<?] and [>] (or [?>]). - By default, processing instructions
* are skipped.
* @param return_comments if set, the parser returns [<!--] .... [-->] comments
* as [Element("--",["contents",c],[])] nodes, where [c] is the string inside
* [<!--] and [-->]. - By default, comments are skipped.
*)
val parse : ?dtd:simplified_dtd -> (* default: html40_dtd *)
?return_declarations:bool -> (* default: false *)
?return_pis:bool -> (* default: false *)
?return_comments:bool -> (* default: false *)
Netchannels.in_obj_channel ->
document list
(** Parses the HTML document from an object channel and returns it as a
* list of [document] nodes.
* For example, to parse the HTML string [s]:
* {[
* let ch = Netchannels.input_string s in
* let doc = parse ch
* ]}
*
* The optional arguments have the same meaning and defaults as in
* {!parse_document}.
*)
(** {b Note on XHTML}
*
* The parser can read XHTML, as long as the following XML features are not
* used:
* - Internal DTD subset, i.e. [<!DOCTYPE html ... [ ... ]>]
* - External entities
* - [<!\[CDATA\[]
* - [<!\[INCLUDE\[]
* - [<!\[IGNORE\[]
*
* The following XML features are ok:
* - Processing instructions
* - Empty elements (e.g. [<br/>]) as long as the element is declared as
* [`Empty].
*)
(** {b Note on Character Encodings}
*
* The parser can only read character streams that are encoded in an ASCII-
* compatible way. For example, it is possible to read a UTF-8-encoded
* stream, but not a UTF-16-encoded stream. All bytes between 1 and 127
* are taken as ASCII, and all other bytes are not interpreted (they are
* simply copied from input to output).
*
* Non-ASCII-compatible streams must be recoded first. For example, to
* read a UTF-16-encoded netchannel [ch], use:
*
* {[
* let p =
* new Netconversion.recoding_pipe ~in_enc:`Enc_utf16 ~out_enc:`Enc_utf8 () in
* let ch' =
* new Netchannels.input_filter ch p in
* let doc =
* Nethtml.parse ch' in
* ch' # close_in();
* ch # close_in();
* ]}
*)
val decode :
?enc:Netconversion.encoding -> (* default: `Enc_iso88591 *)
?subst:(int -> string) -> (* default: failure *)
?entity_base:[ `Html | `Xml | `Empty ] ->
?lookup:(string -> string) -> (* default: failure *)
?dtd:simplified_dtd -> (* default: html40_dtd *)
document list -> document list
(** Converts entities [&name;] and [&#num;] into the corresponding
* characters. The argument [enc] must indicate the character set of
* the document (by default ISO-8859-1 for backwards compatibility).
* If a character cannot be represented in this encoding, the function
* [subst] is called (input is the Unicode code point, output is the
* substituted string). By default, the function fails if such a
* character is found.
*
* The argument [entity_base] selects which entities can be converted
* (see {!Netencoding.Html.decode}). The function [lookup] is called
* for all unknown [&name;] entities. By default, this function fails.
*
* Declarations, processing instructions, and comments are not
* decoded. The same also applies to elements declared as [`Special]
* in the DTD. The [dtd] argument determines the DTD; by default
* {!html40_dtd} is assumed.
*)
val encode :
?enc:Netconversion.encoding -> (* default: `Enc_iso88591 *)
?prefer_name:bool -> (* default: true *)
?dtd:simplified_dtd -> (* default: html40_dtd *)
document list -> document list
(** Converts problematic characters to their corresponding
* entities. The argument [enc] must indicate the character set of
* the document (by default ISO-8859-1 for backwards compatibility).
* If [prefer_name] is set, the algorithm tries to find the named entities
* ([&name;]); otherwise only numeric entities ([&#num;]) are generated.
* Names are preferred by default.
*
* Declarations, processing instructions, and comments are not
* encoded. The same also applies to elements declared as [`Special]
* in the DTD. The [dtd] argument determines the DTD; by default
* {!html40_dtd} is assumed.
*)
val map_list : (string -> string) -> document list -> document list
(** [map_list f doclst]:
* Applies [f] to all attribute values and data strings (except
* the attributes of "?", "!", or "--" nodes) and returns the
* resulting document list.
*
* This can be used, for instance, to change the text encoding of a
* parsed document by passing a recoding function as [f]. As a simpler
* example,
* {[
* let doc' = map_list String.lowercase_ascii doc
* ]}
* converts all attribute values and data strings to lowercase (ASCII
* letters only).
*)
(* The argument passed to the callback function of xmap_list; it tells
* the callback which kind of string is about to be mapped. *)
type xmap_value =
| Xmap_attribute of string * string * string
(* (element name, attribute name, attribute value) *)
| Xmap_data of string option * string
(* (name of the surrounding element or None, data string) *)
val xmap_list : (xmap_value -> string) -> string option ->
document list -> document list
(** [xmap_list f surrounding_element_opt doclst]: Similar to {!map_list},
* the function [f] is applied to all attribute values and data strings.
* Unlike {!map_list}, more information is passed to the callback function
* [f]. This function is called with an {!xmap_value} argument:
* - [Xmap_attribute(ename,aname,aval)]: The function is called for an
* attribute value of element [ename]. The attribute is [aname] and
* has the value [aval]. The function must return the new value of
* the attribute (i.e. [aval']).
* - [Xmap_data(ename_opt,data)]: The function is called for a data
* node surrounded by an element [ename_opt] (which is [None] if the
* data node is the outermost node). The string [data] is the value
* of the data node. The function must return the new value of the
* data node (i.e. [data']).
*
* The argument [surrounding_element_opt] is the name of the element
* that surrounds the nodes of [doclst], or [None] if such an element
* does not exist, or is unknown.
*)
val write : ?dtd:simplified_dtd -> (* default: html40_dtd *)
?xhtml:bool -> (* default: true *)
Netchannels.out_obj_channel ->
document list ->
unit
(** Writes the document to the output channel. No additional encoding or
* decoding happens.
*
* Empty elements are written without end tag (see also optional argument
* [xhtml]); the rest is written unabbreviated.
*
* Example: To write the document to a file:
* {[
* let f = open_out "filename" in
* let ch = new Netchannels.output_channel f in
* write ch doc;
* ch # close_out()
* ]}
*
* @param dtd The assumed simplified DTD, by default {!html40_dtd}
* @param xhtml makes the output compatible with XHTML 1.0 Strict by
* closing [`Empty] tags with "/>" ([true] by default).
*)