(* $Id: nethtml.ml 1296 2009-11-18 13:27:41Z ChriS $ * ---------------------------------------------------------------------- * *) open Nethtml_scanner;; type document = Element of (string * (string*string) list * document list) | Data of string ;; exception End_of_scan;; exception Found;; type element_class = (* What is the class of an element? *) [ `Inline | `Block | `Essential_block | `None | `Everywhere ] ;; type model_constraint = (* The constraint the subelements must fulfill *) [ `Inline | `Block | `Flow (* = `Inline or `Block *) | `Empty | `Any | `Special | `Elements of string list (* Enumeration of allowed elements *) | `Or of (model_constraint * model_constraint) | `Except of (model_constraint * model_constraint) | `Sub_exclusions of (string list * model_constraint) ] ;; type simplified_dtd = (string * (element_class * model_constraint)) list let ( |. ) a b = `Or(a,b);; let ( -. ) a b = `Except(a,b);; let block_elements = (* Only used for exclusions *) [ "p"; "dl"; "div"; "center"; "noscript"; "noframes"; "blockquote"; "form"; "isindex"; "hr"; "table"; "fieldset"; "address"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "pre"; "ul"; "ol"; "dir"; "menu" ];; let html40_dtd = [ (* --------- INLINE ELEMENTS ------------ *) (* %fontstyle; *) "tt", (`Inline, `Inline); "i", (`Inline, `Inline); "b", (`Inline, `Inline); "big", (`Inline, `Inline); "small", (`Inline, `Inline); (* transitional: *) "u", (`Inline, `Inline); "s", (`Inline, `Inline); "strike", (`Inline, `Inline); (* %phrase; *) "em", (`Inline, `Inline); "strong", (`Inline, `Inline); "dfn", (`Inline, `Inline); "code", (`Inline, `Inline); "samp", (`Inline, `Inline); "kbd", (`Inline, `Inline); "var", (`Inline, `Inline); "cite", (`Inline, `Inline); "abbr", (`Inline, `Inline); "acronym", (`Inline, `Inline); (* %special; *) "sup", (`Inline, `Inline); "sub", (`Inline, `Inline); "span", (`Inline, `Inline); "bdo", (`Inline, `Inline); "br", (`Inline, `Empty); "a", (`Inline, `Sub_exclusions(["a"],`Inline)); "img", (`Inline, `Empty); "object", (`Inline, (`Flow |. `Elements ["param"])); "script", (`Inline, `Special); "map", (`Inline, (`Flow |. `Elements ["area"])); "q", (`Inline, `Inline); (* transitional: *) "applet", (`Inline, (`Flow |. `Elements ["param"])); "font", (`Inline, `Inline); "basefont", (`Inline, `Empty); "iframe", (`Inline, `Flow); (* %formctrl; *) "input", (`Inline, `Empty); "select", (`Inline, `Elements ["optgroup"; "option"]); "textarea", (`Inline, `Elements []); (* #PCDATA *) "label", (`Inline, `Sub_exclusions( ["label"], `Inline)); "button", (`Inline, `Sub_exclusions( ["a"; "input"; "select"; "textarea"; "label"; "button"; "form"; "fieldset"; "isindex"; "iframe"], `Flow)); (* ------------ BLOCK ELEMENTS ----------*) "p", (`Block, `Inline); (* %heading; *) "h1", (`Block, `Inline); "h2", (`Block, `Inline); "h3", (`Block, `Inline); "h4", (`Block, `Inline); "h5", (`Block, `Inline); "h6", (`Block, `Inline); (* %list; *) "ul", (`Block, `Elements ["li"]); "ol", (`Block, `Elements ["li"]); (* transitional: *) "dir", (`Block, `Sub_exclusions( block_elements, `Elements ["li"])); "menu", (`Block, `Sub_exclusions( block_elements, `Elements ["li"])); (* %preformatted; *) "pre", (`Block, `Sub_exclusions( [ "img"; "object"; "applet"; "big"; "small"; "sub"; "sup"; "font"; "basefont"], `Inline )); (* other: *) "dl", (`Block, `Elements ["dt"; "dd"]); "div", (`Block, `Flow); "noscript", (`Block, `Flow); "blockquote", (`Block, (`Flow |. `Elements ["script"])); (* strict DTD has `Block here *) "form", (`Block, `Sub_exclusions( ["form"], `Flow |. `Elements ["script"])); (* strict DTD has `Block here *) "hr", (`Block, `Empty); "table", (`Block, `Elements ["caption"; "col"; "colgroup"; "thead"; "tfoot"; "tbody"; "tr"]); "fieldset", (`Block, (`Flow |. `Elements ["legend"])); "address", (`Block, `Inline); (* transitional: *) "center", (`Block, `Flow); "noframes", (`Block, `Flow); "isindex", (`Block, `Empty); (* ------------ OTHER ELEMENTS ----------*) "body", (`None, (`Flow |. `Elements ["script"])); (* strict DTD has `Block here *) "area", (`None, `Empty); "link", (`None, `Empty); "param", (`None, `Empty); "ins", (`Everywhere, `Flow); "del", (`Everywhere, `Flow); "dt", (`None, `Inline); "dd", (`None, `Flow); "li", (`None, `Flow); "optgroup", (`None, `Elements ["option"]); "option", (`None, `Elements []); (* #PCDATA *) "legend", (`None, `Inline); "caption", (`None, `Inline); "thead", (`None, `Elements ["tr"]); "tbody", (`None, `Elements ["tr"]); "tfoot", (`None, `Elements ["tr"]); "colgroup", (`None, `Elements ["col"]); "col", (`None, `Empty); "tr", (`None, `Elements ["th"; "td"]); "th", (`None, `Flow); "td", (`None, `Flow); "head", (`None, `Elements ["title"; "base"; "script"; "style"; "meta"; "link"; "object"]); "title", (`None, `Elements []); (* #PCDATA *) "base", (`None, `Empty); "meta", (`None, `Empty); "style", (`None, `Special); "html", (`None, (`Flow |. `Elements ["head"; "title"; "base"; "script"; "style"; "meta"; "link"; "object"; "body"; "frameset"])); (* transitional: *) "frameset", (`None, `Elements ["frameset"; "frame"; "noframes"]); "frame", (`None, `Empty); ] ;; let relax_dtd dtd = (* Changes (`Inline, `Inline) constraints into (`Inline, `Flow). *) let rec relax_model m = match m with `Inline -> `Flow | `Sub_exclusions(l,m') -> `Sub_exclusions(l,relax_model m') | other -> other in List.map (fun (name, (elclass, elconstr)) -> match elclass with `Inline -> (name, (elclass, relax_model elconstr)) | other -> (name, (elclass, elconstr)) ) dtd ;; let essential_blocks dtd elements = (* Changes the passed block elements into essential block elements *) List.map (fun (name, (elclass, elconstr)) -> match elclass with `Block when List.mem name elements -> (name, ( `Essential_block, elconstr)) | other -> (name, (elclass, elconstr)) ) dtd ;; let relaxed_html40_dtd = essential_blocks (relax_dtd html40_dtd) [ "body"; "table"; "ol"; "ul"; "dl" ] ;; let rec parse_comment buf = let t = scan_comment buf in match t with Mcomment -> let s = Lexing.lexeme buf in s ^ parse_comment buf | Eof -> raise End_of_scan | _ -> (* must be Rcomment *) "" ;; let rec parse_doctype buf = let t = scan_doctype buf in match t with Mdoctype -> let s = Lexing.lexeme buf in s ^ parse_doctype buf | Eof -> raise End_of_scan | _ -> (* must be Rdoctype *) "" ;; let rec parse_pi buf = let t = scan_pi buf in match t with Mpi -> let s = Lexing.lexeme buf in s ^ parse_pi buf | Eof -> raise End_of_scan | _ -> (* must be Rpi *) "" ;; let hashtbl_from_alist l = let ht = Hashtbl.create (List.length l) in List.iter (fun (k, v) -> Hashtbl.add ht k v) l; ht ;; module S = struct type t = string let compare = (Pervasives.compare : string -> string -> int) end module Strset = Set.Make(S);; let parse_document ?(dtd = html40_dtd) ?(return_declarations = false) ?(return_pis = false) ?(return_comments = false) buf = let current_name = ref "" in let current_atts = ref [] in let current_subs = ref [] in let current_excl = ref Strset.empty in (* current exclusions *) let stack = Stack.create() in let dtd_hash = hashtbl_from_alist dtd in let model_of element_name = if element_name = "" then (`Everywhere, `Any) else let extract = function (eclass, `Sub_exclusions(_,m)) -> eclass, m | m -> m in try extract(Hashtbl.find dtd_hash element_name) with Not_found -> (`Everywhere, `Any) in let exclusions_of element_name = if element_name = "" then [] else let extract = function (eclass, `Sub_exclusions(l,_)) -> l | _ -> [] in try extract(Hashtbl.find dtd_hash element_name) with Not_found -> [] in let is_possible_subelement parent_element parent_exclusions sub_element = let (sub_class, _) = model_of sub_element in let rec eval m = match m with `Inline -> sub_class = `Inline | `Block -> sub_class = `Block || sub_class = `Essential_block | `Flow -> sub_class = `Inline || sub_class = `Block || sub_class = `Essential_block | `Elements l -> List.mem sub_element l | `Any -> true | `Or(m1,m2) -> eval m1 || eval m2 | `Except(m1,m2) -> eval m1 && not (eval m2) | `Empty -> false | `Special -> false | `Sub_exclusions(_,_) -> assert false in (sub_class = `Everywhere) || ( (not (Strset.mem sub_element parent_exclusions)) && let (_, parent_model) = model_of parent_element in eval parent_model ) in let unwind_stack sub_name = (* If the current element is not a possible parent element for sub_name, * search the parent element in the stack. * Either the new current element is the parent, or there was no * possible parent. In the latter case, the current element is the * same element as before. *) let backup = Stack.create() in let backup_name = !current_name in let backup_atts = !current_atts in let backup_subs = !current_subs in let backup_excl = !current_excl in try while not (is_possible_subelement !current_name !current_excl sub_name) do (* Maybe we are not allowed to end the current element: *) let (current_class, _) = model_of !current_name in if current_class = `Essential_block then raise Stack.Empty; (* End the current element and remove it from the stack: *) let grant_parent = Stack.pop stack in Stack.push grant_parent backup; (* Save it; may we need it *) let (gp_name, gp_atts, gp_subs, gp_excl) = grant_parent in (* If gp_name is an essential element, we are not allowed to close * it implicitly, even if that violates the DTD. *) let current = Element (!current_name, !current_atts, List.rev !current_subs) in current_name := gp_name; current_atts := gp_atts; current_excl := gp_excl; current_subs := current :: gp_subs done; with Stack.Empty -> (* It did not work! Push everything back to the stack, and * resume the old state. *) while Stack.length backup > 0 do Stack.push (Stack.pop backup) stack done; current_name := backup_name; current_atts := backup_atts; current_subs := backup_subs; current_excl := backup_excl in let parse_atts() = let rec next_no_space p_string = (* p_string: whether string literals in quotation marks are allowed *) let tok = if p_string then scan_element_after_Is buf else scan_element buf in match tok with Space _ -> next_no_space p_string | t -> t in let rec parse_atts_lookahead next = match next with | Relement -> ( [], false ) | Relement_empty -> ( [], true ) | Name n -> ( match next_no_space false with Is -> ( match next_no_space true with Name v -> let toks, is_empty = parse_atts_lookahead (next_no_space false) in ( (String.lowercase n, v) :: toks, is_empty ) | Literal v -> let toks, is_empty = parse_atts_lookahead (next_no_space false) in ( (String.lowercase n,v) :: toks, is_empty ) | Eof -> raise End_of_scan | Relement -> (* Illegal *) ( [], false ) | Relement_empty -> (* Illegal *) ( [], true ) | _ -> (* Illegal *) parse_atts_lookahead (next_no_space false) ) | Eof -> raise End_of_scan | Relement -> (* <tag name> <==> <tag name="name"> *) ( [ String.lowercase n, String.lowercase n ], false) | Relement_empty -> (* <tag name> <==> <tag name="name"> *) ( [ String.lowercase n, String.lowercase n ], true) | next' -> (* assume <tag name ... > <==> <tag name="name" ...> *) let toks, is_empty = parse_atts_lookahead next' in ( ( String.lowercase n, String.lowercase n ) :: toks, is_empty) ) | Eof -> raise End_of_scan | _ -> (* Illegal *) parse_atts_lookahead (next_no_space false) in parse_atts_lookahead (next_no_space false) in let rec parse_special name = (* Parse until </name> *) match scan_special buf with | Lelementend n -> if String.lowercase n = name then "" else "</" ^ n ^ parse_special name | Eof -> raise End_of_scan | Cdata s -> s ^ parse_special name | _ -> (* Illegal *) parse_special name in let rec skip_element() = (* Skip until ">" (or "/>") *) match scan_element buf with | Relement | Relement_empty -> () | Eof -> raise End_of_scan | _ -> skip_element() in let rec parse_next() = let t = scan_document buf in match t with | Lcomment -> let comment = parse_comment buf in if return_comments then current_subs := (Element("--",["contents",comment],[])) :: !current_subs; parse_next() | Ldoctype -> let decl = parse_doctype buf in if return_declarations then current_subs := (Element("!",["contents",decl],[])) :: !current_subs; parse_next() | Lpi -> let pi = parse_pi buf in if return_pis then current_subs := (Element("?",["contents",pi],[])) :: !current_subs; parse_next() | Lelement name -> let name = String.lowercase name in let (_, model) = model_of name in ( match model with `Empty -> let atts, _ = parse_atts() in unwind_stack name; current_subs := (Element(name, atts, [])) :: !current_subs; parse_next() | `Special -> let atts, is_empty = parse_atts() in unwind_stack name; let data = if is_empty then "" else ( let d = parse_special name in (* Read until ">" *) skip_element(); d ) in current_subs := (Element(name, atts, [Data data])) :: !current_subs; parse_next() | _ -> let atts, is_empty = parse_atts() in (* Unwind the stack until we find an element which can be * the parent of the new element: *) unwind_stack name; if is_empty then ( (* Simple case *) current_subs := (Element(name, atts, [])) :: !current_subs; ) else ( (* Push the current element on the stack, and this element * becomes the new current element: *) let new_excl = exclusions_of name in Stack.push (!current_name, !current_atts, !current_subs, !current_excl) stack; current_name := name; current_atts := atts; current_subs := []; List.iter (fun xel -> current_excl := Strset.add xel !current_excl) new_excl; ); parse_next() ) | Cdata data -> current_subs := (Data data) :: !current_subs; parse_next() | Lelementend name -> let name = String.lowercase name in (* Read until ">" *) skip_element(); (* Search the element to close on the stack: *) let found = (name = !current_name) || try Stack.iter (fun (old_name, _, _, _) -> if name = old_name then raise Found; match model_of old_name with `Essential_block, _ -> raise Not_found; (* Don't close essential blocks implicitly *) | _ -> ()) stack; false with Found -> true | Not_found -> false in (* If not found, the end tag is wrong. Simply ignore it. *) if not found then parse_next() else begin (* If found: Remove the elements from the stack, and append * them to the previous element as sub elements *) while !current_name <> name do let old_name, old_atts, old_subs, old_excl = Stack.pop stack in current_subs := (Element (!current_name, !current_atts, List.rev !current_subs)) :: old_subs; current_name := old_name; current_atts := old_atts; current_excl := old_excl done; (* Remove one more element: the element containing the element * currently being closed. *) let old_name, old_atts, old_subs, old_excl = Stack.pop stack in current_subs := (Element (!current_name, !current_atts, List.rev !current_subs)) :: old_subs; current_name := old_name; current_atts := old_atts; current_excl := old_excl; (* Go on *) parse_next() end | Eof -> raise End_of_scan | _ -> parse_next() in try parse_next(); (* never returns. Will get a warning X *) assert false with End_of_scan -> (* Close all remaining elements: *) while Stack.length stack > 0 do let old_name, old_atts, old_subs, old_excl = Stack.pop stack in current_subs := Element (!current_name, !current_atts, List.rev !current_subs) :: old_subs; current_name := old_name; current_atts := old_atts; current_excl := old_excl done; List.rev !current_subs ;; let parse ?dtd ?return_declarations ?return_pis ?return_comments ch = let buf = Netchannels.lexbuf_of_in_obj_channel ch in parse_document ?dtd ?return_declarations ?return_comments ?return_pis buf ;; type xmap_value = | Xmap_attribute of string * string * string (* elname, attname, attval *) | Xmap_data of string option * string (* elname, pcdata *) let rec xmap f surelem doc = (* surdoc: surrounding element *) match doc with | Element(name,atts,subdocs) -> (match name with | "!" | "?" | "--" -> Element(name,atts,xmap_list f None subdocs) | _ -> let atts' = List.map (fun (aname,aval) -> aname, f (Xmap_attribute(name, aname, aval)) ) atts in let subdocs' = xmap_list f (Some name) subdocs in Element(name,atts',subdocs') ) | Data s -> Data(f (Xmap_data(surelem,s))) and xmap_list f surelem l = List.map (xmap f surelem) l;; let map_list f l = xmap_list (function | Xmap_attribute(_, _, v) -> f v | Xmap_data(_, v) -> f v ) None l let encode ?(enc = `Enc_iso88591) ?(prefer_name = true) ?(dtd = html40_dtd) dl = let enc_string = Netencoding.Html.encode ~in_enc:enc ~out_enc:`Enc_usascii ~prefer_name () in let dtd_hash = hashtbl_from_alist dtd in let enc_node = function | Xmap_attribute(_, _, v) -> enc_string v | Xmap_data(None, v) -> enc_string v | Xmap_data(Some el, v) -> let is_special = try snd(Hashtbl.find dtd_hash el) = `Special with Not_found -> false in if is_special then v else enc_string v in xmap_list enc_node None dl ;; let decode ?(enc = `Enc_iso88591) ?subst ?entity_base ?lookup ?(dtd = html40_dtd) dl = let dec_string = Netencoding.Html.decode ~in_enc:enc ~out_enc:enc ?subst ?entity_base ?lookup () in let dtd_hash = hashtbl_from_alist dtd in let dec_node = function | Xmap_attribute(_, _, v) -> dec_string v | Xmap_data(None, v) -> dec_string v | Xmap_data(Some el, v) -> let is_special = try snd(Hashtbl.find dtd_hash el) = `Special with Not_found -> false in if is_special then v else dec_string v in xmap_list dec_node None dl ;; let quote_quot_re = Netstring_str.regexp "\"";; let write_ ~dtd ~xhtml write_os doc = let quote_quot s = Netstring_str.global_substitute quote_quot_re (fun _ _ -> """) s in let rec trav doc = match doc with Element(name,atts,subdocs) -> ( match name with "!" -> write_os "<!"; write_os (List.assoc "contents" atts); write_os ">"; | "?" -> write_os "<?"; write_os (List.assoc "contents" atts); write_os ">"; | "--" -> write_os "<!--"; write_os (List.assoc "contents" atts); write_os "-->"; | _ -> let is_empty = try let _, constr = List.assoc name dtd in constr = `Empty with Not_found -> false in write_os "<"; write_os name; List.iter (fun (aname,aval) -> write_os " "; write_os aname; write_os "=\""; write_os (quote_quot aval); write_os "\""; ) atts; if is_empty then (* Ignore subdocs (even if <> []) because they should not be there. *) write_os (if xhtml then "/>" else ">") else begin write_os ">"; List.iter trav subdocs; write_os "</"; write_os name; write_os ">"; end ) | Data s -> write_os s in try List.iter trav doc with Not_found -> failwith "write" ;; let write ?(dtd = html40_dtd) ?(xhtml = true) ch doc = write_ ~dtd ~xhtml (ch # output_string) doc