Plasma GitLab Archive
Projects Blog Knowledge

open Html;;
open Scan;;

exception End_of_scan;;


let no_end_tag =  (* empty HTML elements *)
  [ "isindex";
    "base";
    "meta";
    "link";
    "p";
    "hr";
    "input";
    "img";
    "param";
    "basefont";
    "br";
    "area";
  ]
;;


let special_tag =   (* other lexical rules *)
  [ "script";
    "style";
  ]
;;


let rec parse_comment buf =
  let t = scan_comment buf in
  match t with
      Mcomment ->
	parse_comment buf
    | Eof ->
	raise End_of_scan
    | _ ->
	()
;;


let rec parse_doctype buf =
  let t = scan_doctype buf in
  match t with
      Mdoctype ->
	parse_doctype buf
    | Eof ->
	raise End_of_scan
    | _ ->
	()
;;


let parse_document buf =
  let current_name = ref "" in
  let current_atts = ref [] in
  let current_subs = ref [] in
  let stack = Stack.create() in

  let rec parse_atts () =
    let rec next_no_space() =
      match scan_element buf with
	  Space _ -> next_no_space()
	| t -> t
    in
    match next_no_space() with
	Relement -> []
      | Name n ->
	  begin match next_no_space() with
	      Is ->
		begin match next_no_space() with
		    Name v ->
		      (String.lowercase n, String.uppercase v) :: parse_atts()
		  | Literal v ->
		      (String.lowercase n,v) :: parse_atts()
		  | Eof ->
		      raise End_of_scan
		  | Relement ->
		      (* Illegal *)
		      []
		  | _ ->
		      (* Illegal *)
		      parse_atts()
		end
	    | Eof ->
		raise End_of_scan
	    | Relement ->
		(* Illegal *)
		[]
	    | _ ->
		(* Illegal *)
		parse_atts()
	  end
      | Eof ->
	  raise End_of_scan
      | _ ->
	  (* Illegal *)
	  parse_atts()
  in

  let rec parse_special name =
    (* Parse until </name> *)
    match scan_special buf with
	Lelementend n ->
	  if n = name then
	    ""
	  else
	    "</" ^ n ^ parse_special name
      | Eof ->
	  raise End_of_scan
      | Cdata s ->
	  s ^ parse_special name
      | _ ->
	  (* Illegal *)
	  parse_special name
  in

  let rec skip_element() =
    (* Skip until ">" *)
    match scan_element buf with
	Relement ->
	  ()
      | Eof ->
	  raise End_of_scan
      | _ ->
	  skip_element()
  in

  let rec parse_next() =
    let t = scan_document buf in
    match t with
	Lcomment -> 
	  parse_comment buf;
	  parse_next()
      | Ldoctype ->
	  parse_doctype buf;
	  parse_next()
      | Lelement name ->
	  let name = String.lowercase name in
	  if List.mem name no_end_tag then begin
	    let atts = parse_atts() in
	    current_subs := (Element(name, atts, [])) :: !current_subs;
	    parse_next()
	  end
	  else if List.mem name special_tag then begin
	    let atts = parse_atts() in
	    let data = parse_special name in
	    (* Read until ">" *)
	    skip_element();
	    current_subs := (Element(name, atts, [Data data])) :: !current_subs;
	    parse_next()
	  end
	  else begin
	    let atts = parse_atts() in
	    Stack.push (!current_name, !current_atts, !current_subs) stack;
	    current_name := name;
	    current_atts := atts;
	    current_subs := [];
	    parse_next()
	  end
      | Cdata data ->
	  current_subs := (Data data) :: !current_subs;
	  parse_next()
      | Lelementend name ->
	  let name = String.lowercase name in
	  (* Read until ">" *)
	  skip_element();
	  (* Search the element to close on the stack: *)
	  let found = ref (name = !current_name) in
	  Stack.iter
	    (fun (old_name, _, _) ->
	       if name = old_name then found := true)
	    stack;
	  (* If not found, the end tag is wrong. Simply ignore it. *)
	  if not !found then
	    parse_next()
	  else begin
	    (* Put the current element on to the stack: *)
	    Stack.push (!current_name, !current_atts, !current_subs) stack;
	    (* If found: Remove the elements from the stack, and append
	     * them to the previous element as sub elements
	     *)
	    let rec remove() =
	      let old_name, old_atts, old_subs = Stack.pop stack in
	        (* or raise Stack.Empty *)
	      if old_name = name then
		old_name, old_atts, old_subs
	      else
		let older_name, older_atts, older_subs = remove() in
		older_name, 
		older_atts,
		(Element (old_name, old_atts, List.rev old_subs) :: older_subs)
	    in
	    let old_name, old_atts, old_subs = remove() in
	    (* Remove one more element: the element containing the element
	     * currently being closed.
	     *)
	    let new_name, new_atts, new_subs = Stack.pop stack in
	    current_name := new_name;
	    current_atts := new_atts;
	    current_subs := (Element (old_name, old_atts, List.rev old_subs)) 
                            :: new_subs;
	    (* Go on *)
	    parse_next()
	  end
      | Eof ->
	  raise End_of_scan
      | _ ->
	  parse_next()
  in
  try
    parse_next();
    List.rev !current_subs
  with
      End_of_scan ->
	(* Close all remaining elements: *)
	Stack.push (!current_name, !current_atts, !current_subs) stack;
	let rec remove() =
	  let old_name, old_atts, old_subs = Stack.pop stack in
	        (* or raise Stack.Empty *)
	  try
	    let older_name, older_atts, older_subs = remove() in
	    older_name, 
	    older_atts,
	    (Element (old_name, old_atts, List.rev old_subs) :: older_subs)
	  with
	      Stack.Empty ->
		old_name, old_atts, old_subs
	in
	let name, atts, subs = remove() in
	List.rev subs
;;


let parse_string s =
  let buf = Lexing.from_string s in
  parse_document buf
;;


let parse_file fname =
  let f = open_in fname in
  try
    let buf = Lexing.from_channel f in
    let doc = parse_document buf in
    close_in f;
    doc
  with
      any ->
	close_in f;
	raise any
;;

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml