Plasma GitLab Archive
Projects Blog Knowledge

(* $Id: nethtml.ml 1296 2009-11-18 13:27:41Z ChriS $
 * ----------------------------------------------------------------------
 *
 *)

open Nethtml_scanner;;

type document =
    Element of (string  *  (string*string) list  *  document list)
  | Data of string
;;


exception End_of_scan;;
exception Found;;


type element_class =         (* What is the class of an element? *)
  [ `Inline
  | `Block
  | `Essential_block
  | `None
  | `Everywhere
  ]
;;

type model_constraint =      (* The constraint the subelements must fulfill *)
  [ `Inline
  | `Block
  | `Flow         (* = `Inline or `Block *)
  | `Empty
  | `Any
  | `Special
  | `Elements of string list  (* Enumeration of allowed elements *)
  | `Or of (model_constraint * model_constraint)
  | `Except of (model_constraint * model_constraint)
  | `Sub_exclusions of (string list * model_constraint)
  ]
;;

type simplified_dtd =
    (string * (element_class * model_constraint)) list


let ( |. ) a b = `Or(a,b);;
let ( -. ) a b = `Except(a,b);;


let block_elements =
  (* Only used for exclusions *)
  [ "p"; "dl"; "div"; "center"; "noscript"; "noframes"; "blockquote"; "form";
    "isindex"; "hr"; "table"; "fieldset"; "address"; "h1"; "h2"; "h3"; "h4";
    "h5"; "h6"; "pre"; "ul"; "ol"; "dir"; "menu" ];;

let html40_dtd =
  [ (* --------- INLINE ELEMENTS ------------ *)
    (* %fontstyle; *)
    "tt",                 (`Inline, `Inline);
    "i",                  (`Inline, `Inline);
    "b",                  (`Inline, `Inline);
    "big",                (`Inline, `Inline);
    "small",              (`Inline, `Inline);
    (* transitional: *)
    "u",                  (`Inline, `Inline);
    "s",                  (`Inline, `Inline);
    "strike",             (`Inline, `Inline);
    (* %phrase; *)
    "em",                 (`Inline, `Inline);
    "strong",             (`Inline, `Inline);
    "dfn",                (`Inline, `Inline);
    "code",               (`Inline, `Inline);
    "samp",               (`Inline, `Inline);
    "kbd",                (`Inline, `Inline);
    "var",                (`Inline, `Inline);
    "cite",               (`Inline, `Inline);
    "abbr",               (`Inline, `Inline);
    "acronym",            (`Inline, `Inline);
    (* %special; *)
    "sup",                (`Inline, `Inline);
    "sub",                (`Inline, `Inline);
    "span",               (`Inline, `Inline);
    "bdo",                (`Inline, `Inline);
    "br",                 (`Inline, `Empty);
    "a",                  (`Inline, `Sub_exclusions(["a"],`Inline));
    "img",                (`Inline, `Empty);
    "object",             (`Inline, (`Flow |. `Elements ["param"]));
    "script",             (`Inline, `Special);
    "map",                (`Inline, (`Flow |. `Elements ["area"]));
    "q",                  (`Inline, `Inline);
    (* transitional: *)
    "applet",             (`Inline, (`Flow |. `Elements ["param"]));
    "font",               (`Inline, `Inline);
    "basefont",           (`Inline, `Empty);
    "iframe",             (`Inline, `Flow);
    (* %formctrl; *)
    "input",              (`Inline, `Empty);
    "select",             (`Inline, `Elements ["optgroup"; "option"]);
    "textarea",           (`Inline, `Elements []);    (* #PCDATA *)
    "label",              (`Inline, `Sub_exclusions( ["label"],
						    `Inline));
    "button",             (`Inline, `Sub_exclusions( ["a"; "input"; "select";
						     "textarea"; "label";
						     "button"; "form";
						     "fieldset"; "isindex";
						     "iframe"],
						    `Flow));
    (* ------------ BLOCK ELEMENTS ----------*)
    "p",                  (`Block, `Inline);
    (* %heading; *)
    "h1",                 (`Block, `Inline); 
    "h2",                 (`Block, `Inline);
    "h3",                 (`Block, `Inline);
    "h4",                 (`Block, `Inline);
    "h5",                 (`Block, `Inline);
    "h6",                 (`Block, `Inline);
    (* %list; *)
    "ul",                 (`Block, `Elements ["li"]);
    "ol",                 (`Block, `Elements ["li"]);
    (* transitional: *)
    "dir",                (`Block, `Sub_exclusions( block_elements,
						   `Elements ["li"]));
    "menu",               (`Block, `Sub_exclusions( block_elements,
						   `Elements ["li"]));
    (* %preformatted; *)
    "pre",                (`Block, `Sub_exclusions( [ "img"; "object"; "applet";
						      "big"; "small"; "sub"; 
						      "sup"; "font"; "basefont"],
						    `Inline ));
    (* other: *)
    "dl",                 (`Block, `Elements ["dt"; "dd"]);
    "div",                (`Block, `Flow);
    "noscript",           (`Block, `Flow);
    "blockquote",         (`Block, (`Flow |. `Elements ["script"]));
                          (* strict DTD has `Block here *)
    "form",               (`Block, `Sub_exclusions( ["form"],
						    `Flow |. 
						       `Elements ["script"]));
                          (* strict DTD has `Block here *)
    "hr",                 (`Block, `Empty);
    "table",              (`Block, `Elements ["caption"; "col"; "colgroup";
					      "thead"; "tfoot"; "tbody"; "tr"]);
    "fieldset",           (`Block, (`Flow |. `Elements ["legend"]));
    "address",            (`Block, `Inline);
    (* transitional: *)
    "center",             (`Block, `Flow);
    "noframes",           (`Block, `Flow);
    "isindex",            (`Block, `Empty);
    (* ------------ OTHER ELEMENTS ----------*)
    "body",               (`None, (`Flow |. `Elements ["script"]));
                          (* strict DTD has `Block here *)
    "area",               (`None, `Empty);
    "link",               (`None, `Empty);
    "param",              (`None, `Empty);
    "ins",                (`Everywhere, `Flow);
    "del",                (`Everywhere, `Flow);
    "dt",                 (`None, `Inline);
    "dd",                 (`None, `Flow);
    "li",                 (`None, `Flow);
    "optgroup",           (`None, `Elements ["option"]);
    "option",             (`None, `Elements []);   (* #PCDATA *)
    "legend",             (`None, `Inline);
    "caption",            (`None, `Inline);
    "thead",              (`None, `Elements ["tr"]);
    "tbody",              (`None, `Elements ["tr"]);
    "tfoot",              (`None, `Elements ["tr"]);
    "colgroup",           (`None, `Elements ["col"]);
    "col",                (`None, `Empty);
    "tr",                 (`None, `Elements ["th"; "td"]);
    "th",                 (`None, `Flow);
    "td",                 (`None, `Flow);
    "head",               (`None, `Elements ["title"; "base"; "script";
					     "style"; "meta"; "link";
					     "object"]);
    "title",              (`None, `Elements []);   (* #PCDATA *)
    "base",               (`None, `Empty);
    "meta",               (`None, `Empty);
    "style",              (`None, `Special);
    "html",               (`None, (`Flow |. 
				       `Elements ["head"; 
						  "title"; "base"; "script";
						  "style"; "meta"; "link";
						  "object";
						  "body"; "frameset"]));
    (* transitional: *)
    "frameset",           (`None, `Elements ["frameset"; "frame"; "noframes"]);
    "frame",              (`None, `Empty);
  ]
;;


let relax_dtd dtd =
  (* Changes (`Inline, `Inline) constraints into (`Inline, `Flow). *)
  let rec relax_model m =
    match m with
	`Inline -> `Flow
      | `Sub_exclusions(l,m') -> `Sub_exclusions(l,relax_model m')
      | other -> other
  in
  List.map
    (fun (name, (elclass, elconstr)) ->
       match elclass with
	   `Inline ->
	     (name, (elclass, relax_model elconstr))
	 | other ->
	     (name, (elclass, elconstr))
    )
    dtd
;;


let essential_blocks dtd elements =
  (* Changes the passed block elements into essential block elements *)
  List.map
    (fun (name, (elclass, elconstr)) ->
       match elclass with
	   `Block when List.mem name elements ->
	     (name, ( `Essential_block, elconstr))
         | other ->
	     (name, (elclass, elconstr))
    )
    dtd
;;


let relaxed_html40_dtd =
  essential_blocks
    (relax_dtd html40_dtd)
    [ "body"; "table"; "ol"; "ul"; "dl" ]
;;


let rec parse_comment buf =
  let t = scan_comment buf in
  match t with
      Mcomment ->
	let s = Lexing.lexeme buf in
	s ^ parse_comment buf
    | Eof ->
	raise End_of_scan
    | _ ->
	(* must be Rcomment *)
	""
;;


let rec parse_doctype buf =
  let t = scan_doctype buf in
  match t with
      Mdoctype ->
	let s = Lexing.lexeme buf in
	s ^ parse_doctype buf
    | Eof ->
	raise End_of_scan
    | _ ->
	(* must be Rdoctype *)
	""
;;


let rec parse_pi buf =
  let t = scan_pi buf in
  match t with
      Mpi ->
	let s = Lexing.lexeme buf in
	s ^ parse_pi buf
    | Eof ->
	raise End_of_scan
    | _ ->
	(* must be Rpi *)
	""
;;


let hashtbl_from_alist l =
  let ht = Hashtbl.create (List.length l) in
  List.iter
    (fun (k, v) ->
       Hashtbl.add ht k v)
    l;
  ht
;;


module S = struct
  type t = string
  let compare = (Pervasives.compare : string -> string -> int)
end

module Strset = Set.Make(S);;


let parse_document ?(dtd = html40_dtd) 
                   ?(return_declarations = false) 
                   ?(return_pis = false)
                   ?(return_comments = false) buf =
  let current_name = ref "" in
  let current_atts = ref [] in
  let current_subs = ref [] in
  let current_excl = ref Strset.empty in      (* current exclusions *)
  let stack = Stack.create() in
  let dtd_hash = hashtbl_from_alist dtd in

  let model_of element_name =
    if element_name = "" then
      (`Everywhere, `Any)
    else
      let extract =
	function
	    (eclass, `Sub_exclusions(_,m)) -> eclass, m
	  | m -> m
      in
      try
	extract(Hashtbl.find dtd_hash element_name)
      with
	  Not_found -> (`Everywhere, `Any)
  in

  let exclusions_of element_name =
    if element_name = "" then
      []
    else
      let extract =
	function
	    (eclass, `Sub_exclusions(l,_)) -> l
	  | _ -> []
      in
      try
	extract(Hashtbl.find dtd_hash element_name)
      with
	  Not_found -> []
  in

  let is_possible_subelement parent_element parent_exclusions sub_element =
    let (sub_class, _) = model_of sub_element in
    let rec eval m =
      match m with
	  `Inline     -> sub_class = `Inline
	| `Block      -> sub_class = `Block  || sub_class = `Essential_block
	| `Flow       -> sub_class = `Inline || sub_class = `Block ||
		         sub_class = `Essential_block
	| `Elements l -> List.mem sub_element l
	| `Any        -> true
	| `Or(m1,m2)  -> eval m1 || eval m2
	| `Except(m1,m2) -> eval m1 && not (eval m2)
	| `Empty      -> false
	| `Special    -> false
	| `Sub_exclusions(_,_) -> assert false
    in
    (sub_class = `Everywhere) || (
	      (not (Strset.mem sub_element parent_exclusions)) &&
	      let (_, parent_model) = model_of parent_element in
	      eval parent_model
	    )
  in

  let unwind_stack sub_name =
    (* If the current element is not a possible parent element for sub_name,
     * search the parent element in the stack.
     * Either the new current element is the parent, or there was no
     * possible parent. In the latter case, the current element is the
     * same element as before.
     *)
    let backup = Stack.create() in
    let backup_name = !current_name in
    let backup_atts = !current_atts in
    let backup_subs = !current_subs in
    let backup_excl = !current_excl in
    try
      while not (is_possible_subelement !current_name !current_excl sub_name) do
	(* Maybe we are not allowed to end the current element: *)
	let (current_class, _) = model_of !current_name in
	if current_class = `Essential_block then raise Stack.Empty;
	(* End the current element and remove it from the stack: *)
	let grant_parent = Stack.pop stack in
	Stack.push grant_parent backup;        (* Save it; may we need it *)
	let (gp_name, gp_atts, gp_subs, gp_excl) = grant_parent in
	(* If gp_name is an essential element, we are not allowed to close
	 * it implicitly, even if that violates the DTD.
	 *)
	let current = Element (!current_name, !current_atts, 
			       List.rev !current_subs) in
	current_name := gp_name;
	current_atts := gp_atts;
	current_excl := gp_excl;
	current_subs := current :: gp_subs
      done;
    with
	Stack.Empty ->
	  (* It did not work! Push everything back to the stack, and
	   * resume the old state.
	   *)
	  while Stack.length backup > 0 do
	    Stack.push (Stack.pop backup) stack
	  done;
	  current_name := backup_name;
	  current_atts := backup_atts;
	  current_subs := backup_subs;
	  current_excl := backup_excl
  in

  let parse_atts() =
    let rec next_no_space p_string =
      (* p_string: whether string literals in quotation marks are allowed *)
      let tok =
	if p_string then
	  scan_element_after_Is buf
	else
	  scan_element buf in
      match tok with
	  Space _ -> next_no_space p_string
	| t -> t
    in

    let rec parse_atts_lookahead next =
      match next with
	| Relement  -> ( [], false )
	| Relement_empty  -> ( [], true )
      	| Name n ->
	    ( match next_no_space false with
	      	Is ->
		  ( match next_no_space true with
		      Name v ->
			let toks, is_empty =
			  parse_atts_lookahead (next_no_space false) in
		      	( (String.lowercase n, v) :: toks, is_empty )
		    | Literal v ->
			let toks, is_empty =
			  parse_atts_lookahead (next_no_space false) in
		      	( (String.lowercase n,v) :: toks, is_empty )
		    | Eof ->
		      	raise End_of_scan
		    | Relement ->
		      	(* Illegal *)
		      	( [], false )
		    | Relement_empty ->
		      	(* Illegal *)
		      	( [], true )
		    | _ ->
		      	(* Illegal *)
		      	parse_atts_lookahead (next_no_space false)
		  )
	      | Eof ->
		  raise End_of_scan
	      | Relement ->
		  (* <tag name> <==> <tag name="name"> *)
		  ( [ String.lowercase n, String.lowercase n ], false)
	      | Relement_empty ->
		  (* <tag name> <==> <tag name="name"> *)
		  ( [ String.lowercase n, String.lowercase n ], true)
	      | next' ->
		  (* assume <tag name ... > <==> <tag name="name" ...> *)
		  let toks, is_empty = 
		    parse_atts_lookahead next' in
		  ( ( String.lowercase n, String.lowercase n ) :: toks,
		    is_empty)
	    )
      	| Eof ->
	    raise End_of_scan
      	| _ ->
	    (* Illegal *)
	    parse_atts_lookahead (next_no_space false)
    in
    parse_atts_lookahead (next_no_space false)
  in

  let rec parse_special name =
    (* Parse until </name> *)
    match scan_special buf with
      | Lelementend n ->
	  if String.lowercase n = name then
	    ""
	  else
	    "</" ^ n ^ parse_special name
      | Eof ->
	  raise End_of_scan
      | Cdata s ->
	  s ^ parse_special name
      | _ ->
	  (* Illegal *)
	  parse_special name
  in

  let rec skip_element() =
    (* Skip until ">" (or "/>") *)
    match scan_element buf with
      | Relement | Relement_empty ->
	  ()
      | Eof ->
	  raise End_of_scan
      | _ ->
	  skip_element()
  in

  let rec parse_next() =
    let t = scan_document buf in
    match t with
      | Lcomment ->
	  let comment = parse_comment buf in
	  if return_comments then
	    current_subs := (Element("--",["contents",comment],[])) :: !current_subs;
	  parse_next()
      | Ldoctype ->
	  let decl = parse_doctype buf in
	  if return_declarations then
	    current_subs := (Element("!",["contents",decl],[])) :: !current_subs;
	  parse_next()
      | Lpi ->
	  let pi = parse_pi buf in
	  if return_pis then
	    current_subs := (Element("?",["contents",pi],[])) :: !current_subs;
	  parse_next()
      | Lelement name ->
	  let name = String.lowercase name in
	  let (_, model) = model_of name in
	  ( match model with
		`Empty ->
		  let atts, _ = parse_atts() in
		  unwind_stack name;
		  current_subs := (Element(name, atts, [])) :: !current_subs;
		  parse_next()
	      | `Special ->
		  let atts, is_empty = parse_atts() in
		  unwind_stack name;
		  let data = 
		    if is_empty then 
		      ""
		    else (
		      let d = parse_special name in
		      (* Read until ">" *)
		      skip_element();
		      d
		    ) in
		  current_subs := (Element(name, atts, [Data data])) :: !current_subs;
		  parse_next()
	      | _ ->
		  let atts, is_empty = parse_atts() in
		  (* Unwind the stack until we find an element which can be
		   * the parent of the new element:
		   *)
		  unwind_stack name;
		  if is_empty then (
		    (* Simple case *)
		    current_subs := (Element(name, atts, [])) :: !current_subs;
		  )
		  else (
		    (* Push the current element on the stack, and this element
		     * becomes the new current element:
		     *)
		    let new_excl = exclusions_of name in
		    Stack.push 
		      (!current_name, 
		       !current_atts, !current_subs, !current_excl)
		      stack;
		    current_name := name;
		    current_atts := atts;
		    current_subs := [];
		    List.iter
		      (fun xel -> current_excl := Strset.add xel !current_excl)
		      new_excl;
		  );
		  parse_next()
	  )
      | Cdata data ->
	  current_subs := (Data data) :: !current_subs;
	  parse_next()
      | Lelementend name ->
	  let name = String.lowercase name in
	  (* Read until ">" *)
	  skip_element();
	  (* Search the element to close on the stack: *)
	  let found = 
	    (name = !current_name) ||
	    try
	      Stack.iter
		(fun (old_name, _, _, _) ->
		   if name = old_name then raise Found;
		   match model_of old_name with
		       `Essential_block, _ -> raise Not_found;
			 (* Don't close essential blocks implicitly *)
		     | _ -> ())
		stack;
	      false
	    with
		Found -> true
	      | Not_found -> false
	  in
	  (* If not found, the end tag is wrong. Simply ignore it. *)
	  if not found then
	    parse_next()
	  else begin
	    (* If found: Remove the elements from the stack, and append
	     * them to the previous element as sub elements
	     *)
	    while !current_name <> name do
	      let old_name, old_atts, old_subs, old_excl = Stack.pop stack in
	      current_subs := (Element (!current_name, !current_atts,
					List.rev !current_subs)) :: old_subs;
	      current_name := old_name;
	      current_atts := old_atts;
	      current_excl := old_excl
	    done;
	    (* Remove one more element: the element containing the element
	     * currently being closed.
	     *)
	    let old_name, old_atts, old_subs, old_excl = Stack.pop stack in
	    current_subs := (Element (!current_name, !current_atts,
				      List.rev !current_subs)) :: old_subs;
	    current_name := old_name;
	    current_atts := old_atts;
	    current_excl := old_excl;
	    (* Go on *)
	    parse_next()
	  end
      | Eof ->
	  raise End_of_scan
      | _ ->
	  parse_next()
  in
  try
    parse_next();  (* never returns. Will get a warning X *)
    assert false
  with
      End_of_scan ->
	(* Close all remaining elements: *)
	while Stack.length stack > 0 do
	  let old_name, old_atts, old_subs, old_excl = Stack.pop stack in
	  current_subs := Element (!current_name,
				   !current_atts,
				   List.rev !current_subs) :: old_subs;
	  current_name := old_name;
	  current_atts := old_atts;
	  current_excl := old_excl
	done;
	List.rev !current_subs
;;

let parse ?dtd ?return_declarations ?return_pis ?return_comments ch =
  let buf = Netchannels.lexbuf_of_in_obj_channel ch in
  parse_document ?dtd ?return_declarations ?return_comments ?return_pis buf
;;  


type xmap_value =
  | Xmap_attribute of string * string * string (* elname, attname, attval *)
  | Xmap_data of string option * string        (* elname, pcdata *)

let rec xmap f surelem doc =
  (* surdoc: surrounding element *)
  match doc with
    | Element(name,atts,subdocs) ->
	(match name with
	   | "!"
	   | "?"
	   | "--" ->
	       Element(name,atts,xmap_list f None subdocs)
	   | _ ->
	       let atts' =
		 List.map
		   (fun (aname,aval) ->
		      aname, f (Xmap_attribute(name, aname, aval))
		   )
		   atts
	       in
	       let subdocs' =  xmap_list f (Some name) subdocs in
	       Element(name,atts',subdocs')
	)
    | Data s ->
	Data(f (Xmap_data(surelem,s)))
and xmap_list f surelem l = List.map (xmap f surelem) l;;

let map_list f l =
  xmap_list
    (function
       | Xmap_attribute(_, _, v) -> f v
       | Xmap_data(_, v) -> f v
    )
    None
    l


let encode ?(enc = `Enc_iso88591) ?(prefer_name = true) ?(dtd = html40_dtd)
           dl = 
  let enc_string =
    Netencoding.Html.encode 
      ~in_enc:enc ~out_enc:`Enc_usascii ~prefer_name () in
  let dtd_hash = hashtbl_from_alist dtd in
  let enc_node = 
    function
      | Xmap_attribute(_, _, v) -> enc_string v
      | Xmap_data(None, v) -> enc_string v
      | Xmap_data(Some el, v) ->
	  let is_special =
	    try snd(Hashtbl.find dtd_hash el) = `Special
	    with Not_found -> false in
	  if is_special then
	    v
	  else
	    enc_string v in
  xmap_list enc_node None dl
;;

let decode ?(enc = `Enc_iso88591) ?subst ?entity_base ?lookup 
           ?(dtd = html40_dtd)
           dl = 
  let dec_string =
    Netencoding.Html.decode 
      ~in_enc:enc ~out_enc:enc ?subst ?entity_base ?lookup () in
  let dtd_hash = hashtbl_from_alist dtd in
  let dec_node = 
    function
      | Xmap_attribute(_, _, v) -> dec_string v
      | Xmap_data(None, v) -> dec_string v
      | Xmap_data(Some el, v) ->
	  let is_special =
	    try snd(Hashtbl.find dtd_hash el) = `Special
	    with Not_found -> false in
	  if is_special then
	    v
	  else
	    dec_string v in
  xmap_list dec_node None dl
;;


let quote_quot_re = Netstring_str.regexp "\"";;

let write_ ~dtd ~xhtml write_os doc =
  let quote_quot s =
    Netstring_str.global_substitute quote_quot_re 
      (fun _ _ -> "&quot;")
      s
  in
  let rec trav doc =
    match doc with
	Element(name,atts,subdocs) ->
	  ( match name with
		"!" ->
		  write_os "<!";
		  write_os (List.assoc "contents" atts);
		  write_os ">";
	      | "?" ->
		  write_os "<?";
		  write_os (List.assoc "contents" atts);
		  write_os ">";
	      | "--" ->
		  write_os "<!--";
		  write_os (List.assoc "contents" atts);
		  write_os "-->";
	      | _ ->
		  let is_empty =
		    try 
		      let _, constr = List.assoc name dtd in
		      constr = `Empty
		    with
			Not_found -> false
		  in
		  write_os "<";
		  write_os name;
		  List.iter
		    (fun (aname,aval) ->
		       write_os " ";
		       write_os aname;
		       write_os "=\"";
		       write_os (quote_quot aval);
		       write_os "\"";
		    )
		    atts;
		  if is_empty then
		    (* Ignore subdocs (even if <> []) because they should
		       not be there. *)
		    write_os (if xhtml then "/>" else ">")
                  else begin
		    write_os ">";
		    List.iter trav subdocs;
		    write_os "</";
		    write_os name;
		    write_os ">";
		  end
	  )
      | Data s ->
	  write_os s
  in
  try
    List.iter trav doc
  with
      Not_found -> failwith "write"
;;

let write ?(dtd = html40_dtd) ?(xhtml = true) ch doc =
  write_ ~dtd ~xhtml (ch # output_string) doc

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml