/home/cityd/production/web-camlcity/back0/trunk/projects/dl/pxp-1.2.0test2/examples/eventparser/sample.ml

(* $Id: sample.ml 662 2004-05-25 20:57:28Z gerd $
 * ----------------------------------------------------------------------
 *
 *)


(**********************************************************************)
(* Examples for event-based parsing ("SAX-like parsing")              *)
(**********************************************************************)


open Pxp_yacc
open Pxp_lexer_types
open Pxp_types
open Expr
open Exprlex
open Printf


(* dump_event: dumps a single parsing event *)

let dump_event =
  function
      E_start_doc(v,sa,dtd) ->
	printf "E_start_doc version=%s standalone=%b\n" v sa
    | E_end_doc ->
	printf "E_end_doc\n"
    | E_start_tag(name,attlist,_) ->
	printf "E_start_tag %s %s\n" name 
	  (String.concat " " (List.map (fun (n,v) -> n ^ "=" ^ v) attlist))
    | E_end_tag(name,_) ->
	printf "E_end_tag %s\n" name
    | E_char_data data ->
	printf "E_char_data %s\n" data
    | E_pinstr(target,data) ->
	printf "E_pinstr %s %s\n" target data
    | E_comment data ->
	printf "E_comment %s\n" data
    | E_position(ent,line,col) ->
	printf "E_position %s line=%d col=%d\n" ent line col
    | E_error e ->
	printf "E_error %s\n" (Printexc.to_string e)
    | E_end_of_stream ->
	printf "E_end_of_stream\n"
;;


(* parse: prints the events while parsing the passed string *)

let parse s =
  process_entity
    default_config
    (`Entry_document[])
    (create_entity_manager default_config (from_string s))
    dump_event;
  flush stdout
;;


(* curly_parse: demonstrates how to use escape_contents. The character
 * { escapes from normal parsing and calls [escape]. The arithmetic
 * expression inside { ... } is evaluated, and the result is taken 
 * as the text content.
 * Try: curly_parse "<a>{123 + 5}</a>"
 *      curly_parse "<a>{{123 + 5}}</a>"
 *      curly_parse "<a>{123 + 5</a>}</a>"
 *)

let inc_col (l,c) = (l,c+1);;
let inc_line (l,c) = (l+1,0);;
let add_col n (l,c) = (l,c+n);;


let curly_parse s =
  let parse_expr mng =
    (* We get now the current lexical buffer of PXP, and use it for
     * our own parsing. In particular, we call Expr.topexpr
     * to parse the arithmetic expression. While parsing,
     * we track the current line and column (function [scan]).
     *)
    let line_col = ref (mng # current_line_column) in
    (* Note: current_line_column returns the position of the beginning of
     * the token that has been parsed last. The last token was "{" (Lcurly).
     * So we must add 1 to this position to get the position of the 
     * beginning of the next token.
     *)
    line_col := inc_col !line_col;
    let rec scan buf =
      match scan_expr buf with
	  Newline -> 
	    line_col := inc_line !line_col; 
	    scan buf
	| Space ->
	    let n = Lexing.lexeme_end buf - Lexing.lexeme_start buf in
	    line_col := add_col n !line_col;
	    scan buf
	| tok -> 
	    let n = Lexing.lexeme_end buf - Lexing.lexeme_start buf in
	    line_col := add_col n !line_col;
	    tok
    in
    let lexbuf = mng # current_lexbuf in
    let value = topexpr scan lexbuf in
    printf "Result of expression: %d\n" value;
    mng # update_line_column !line_col;
    string_of_int value
  in

  let escape_contents tok mng =
    (* This function is called when "{", "{{", "}", or "}}" are found in
     * character node context.
     *)
    match tok with
	Lcurly ->
	  (* "{" found: *)
	  parse_expr mng
      | LLcurly -> 
	  (* "{{" found: map to "{" *)
	  "{"
      | Rcurly -> 
	  (* "}" found *)
	  failwith "Single brace } not allowed"
      | RRcurly -> 
	  (* "}}" found: map to "}" *)
	  "}"
      | _ -> assert false
  in

  let escape_attributes tok pos mng =
    (* This function is called when "{", "{{", "}", or "}}" are found in
     * attribute values.
     *)
    match tok with
	Lcurly ->
	  (* "{" found: *)
	  parse_expr mng
      | LLcurly ->
	  (* "{{" found: *)
	  "{"
      | Rcurly ->
	  (* "}" found: *)
	  failwith "Single brace } not allowed"
      | RRcurly ->
	  (* "}}" found: *)
	  "}"
      | _ ->
	  assert false
  in

  let config = { default_config with 
		   escape_contents = Some escape_contents;
		   escape_attributes = Some escape_attributes;
	       } in
  process_entity
    config
    (`Entry_document[])
    (create_entity_manager config (from_string s))
    dump_event;
  flush stdout
;;


(* rec_curly_parse: Here, escape_contents calls the XML parser recursively,
 * i.e. you can write XML documents inside curly braces, like in
 * rec_curly_parse "<A> { <B> x </B> } </A>" or
 * rec_curly_parse "<A att='{ <B> x </B> }'>y</A>"
 *
 * This is currently very experimental!
 *)

class any_entity_id = object end ;;
  (* An entity ID is an object without properties except identity *)

let rec_curly_parse s =
  let ent_id_guard = new any_entity_id in
  let base_config = default_config in

  let rec escape ent_id tok mng =
    (* ent_id: is the entity ID containing the last Lcurly, or ent_id_guard
     *  when there was none yet
     *)
    let current_ent = mng # current_entity in
    let current_ent_id = (current_ent :> entity_id) in
    match tok with
	Lcurly ->
	  printf "Starting subparser...\n";
	  (* Because [current_ent] is already open, we cannot use it as
	   * top-level entity in [process_entity] (it is not possible to
	   * open an entity several times). The solution is [sub_ent],
	   * a so-called entity section that behaves much like [current_ent]
	   * and shares most of the state with [current_ent], but pretends
	   * it were an entity of its own.
	   *)
	  let sub_ent = new Pxp_entity.entity_section current_ent in
	  let sub_ent_id = (sub_ent :> entity_id) in
	  let sub_config =
	    { base_config with
		escape_contents = Some (escape sub_ent_id) ;
		escape_attributes = Some (escape_att sub_ent_id) ;
	    }
	  in
	  (* Pushing sub_ent makes it the top-level entity: *)
	  mng # push_entity sub_ent;  
	  process_entity
	    sub_config
	    (`Entry_document[])
	    mng
	    dump_event;
	  assert(mng # current_entity = sub_ent);
	  (* Pop sub_ent afterwards: *)
	  mng # pop_entity ();
	  ""
      | LLcurly -> "{"
      | Rcurly -> 
	  if ent_id = ent_id_guard then
	    (* A right curly brace without matching left curly brace *)
	    failwith "Single brace } not allowed"
	  else 
	    if ent_id = current_ent_id then (
	      (* The normal case: *)
	      printf "Stopping subparser...\n";
	      ignore(current_ent # close_entity);
	      ""
		(* Causes that the current [process_entity] parser invocation
		 * terminates (if possible)
		 *)
	    )
	    else
	      (* ent_id <> current_ent_id: This can happen if entities and
	       * braces are used in strange ways:
	       * <!DOCTYPE a [ <!ENTITY brace '}'> ]> 
	       * <a> { <b>xxx</b> &brace; </a>
	       *)
	      failwith "Bad nesting of entities and braces {...}"
	
      | RRcurly -> "}"
      | _ -> assert false
  and escape_att ent_id tok pos mng = escape ent_id tok mng
  in
  let config = 
    { base_config with 
	escape_contents = Some (escape ent_id_guard);
	escape_attributes = Some (escape_att ent_id_guard);
    } in

  process_entity
    config
    (`Entry_document[])
    (create_entity_manager config (from_string s))
    dump_event;
  flush stdout
;;


(* parse_expr: An example for process_expr that parses the expressions
 * found in a string one after another.
 * Example:
 *      parse_expr "<?pi?><abc>def</abc> <qrt>def</qrt> "
 *
 * Unfortunately, we need the undocumented methods [open_entity] and
 * [close_entity] from [Pxp_entity], and we need the knowledge that
 * a [Begin_entity] token can be found at the beginning of the entity,
 * and that an [End_entity] token signals the end of the entity. Using
 * [process_expr] is quite low-level.
 *)

let parse_expr s =
  let config =
    { default_config with
	enable_pinstr_nodes = true;
	enable_comment_nodes = true
    } in
  let m = create_entity_manager config (from_string s) in
  m # current_entity # open_entity true Pxp_lexer_types.Content;
  let begin_entity_token = !(m # yy_get_next_ref)() in
  assert (begin_entity_token = Begin_entity);
  let tok = ref Ignore in       (* Ignore does not occur in the token stream *)
  while !tok <> End_entity do
    let first_token =
      if !tok <> Ignore then Some !tok else None in
    printf "*** Calling process_expr...\n";
    process_expr ?first_token ~following_token:tok config m dump_event;
  done;
  ignore(m # current_entity # close_entity);
;;
This web site is published by Informatikbüro Gerd Stolpmann
Plasma	GitLab	Archive
Projects	Blog	Knowledge