(* $Id$ -*- tuareg -*- * ---------------------------------------------------------------------- * PXP: The polymorphic XML parser for Objective Caml. * Copyright by Gerd Stolpmann. See LICENSE for details. *) open Pxp_types open Pxp_lexers open Pxp_lexer_types open Pxp_entity_manager open Pxp_dtd open Pxp_aux type context = { mutable current : unit -> token; (* get the current token *) mutable get_next : unit -> token; (* go on to the next token; return it *) mutable current_token : token; (* This is the current token *) mutable manager : entity_manager; (* The entity manager *) } type continuation_state = { cont_context : context; cont_extend_dtd : bool; cont_process_xmldecl : bool; } exception End_of_parsing (* One way to signal that parsing is done *) exception Interrupt_parsing of continuation_state (* Interrupt the parsing loop to process pull-style events *) let make_context ?first_token entity_manager = let c = { current = (fun _ -> assert false); get_next = (fun _ -> assert false); current_token = Eof; manager = entity_manager; } in (* Note that the function which is stored in get_next_ref can be changed * as a side-effect when an entity is opened or closed. The function in * c.get_next must be programmed such that always the current "get_next" * function is executed. *) let get_next_ref = entity_manager # yy_get_next_ref in c.current <- (fun () -> c.current_token); c.get_next <- (fun () -> let tok = !get_next_ref() in c.current_token <- tok; tok); ( match first_token with Some tok -> c.current_token <- tok | None -> ignore(c.get_next()); ); c ;; type extended_entry = [ entry | `Entry_continuation of continuation_state ] (**********************************************************************) (* An array based stack implementation. Advantage: Constant memory * consumption. *) type 't array_stack = { mutable stack_array : 't array; mutable stack_top : int; mutable stack_null : 't; } ;; let stack_create null = (* null: a dummy element *) { stack_array = Array.make 100 null; stack_top = -1; stack_null = null; } ;; let stack_push x st = let top = st.stack_top + 1 in if top >= Array.length st.stack_array then st.stack_array <- Array.append st.stack_array (Array.make 100 st.stack_null); st.stack_array.(top) <- x; st.stack_top <- top ;; let stack_top st = let top = st.stack_top in if top >= 0 then st.stack_array.(top) else raise Stack.Empty ;; let stack_pop st = let top = st.stack_top in if top >= 0 then begin let x = st.stack_array.(top) in st.stack_array.(top) <- st.stack_null; st.stack_top <- top-1; x end else raise Stack.Empty ;; (**********************************************************************) class virtual core_parser init_dtd init_config init_pull_counter = let make_pool_string = pool_string init_config.name_pool in object (self) val mutable dtd = init_dtd (* The DTD being parsed; or the DTD currently assumed *) val lfactory = init_dtd # lexer_factory val config = init_config (* The current configuration *) val mutable n_tags_open = 0 (* Number of begin tags that have been parsed and whose corresponding * end tags have not yet been parsed *) val mutable n_entities_open = 0 (* Number of open Begin_entity tokens. If this number decreases to 0 * the parser will terminate. *) val pull_counter_limit = init_pull_counter val mutable pull_counter = init_pull_counter (* If < 0, pull parsing is disabled. * If >= 0: pull_counter is decreased for every parsing loop. If * zero, pull_counter is reinitialized to pull_counter_limit, and * an Interrupt_parsing exception is raised. 
      *)

    val mutable p_internal_subset = false
      (* true while parsing the internal subset - there are some additional
       * constraints for internal subsets, and because of this it must
       * be known whether the current declaration is contained in the
       * internal or external subset of the DTD.
       *)

    val mutable permit_any_content = false
      (* Used with `Entry_content to allow entity references outside
       * elements and to turn off other parsing restrictions
       *)

    method private only_whitespace data =
      (* Checks that the string "data" contains only whitespace. On failure,
       * WF_error is raised.
       *)
      if not (Pxp_lib.only_whitespace data) then
        raise(WF_error("Data not allowed here"));
      ()

    (************************************************************************)
    (* Namespace processing: The following values and methods MAY be used
     * by the subclass, but this is completely voluntary
     *)

    val mutable ns_stack = Stack.create()
      (* Stack of previous ns_scope, ns_cache, ns_default_normprefix *)

    val mutable ns_scope = None
      (* The current namespace_scope *)

    val mutable ns_cache = StringMap.empty
      (* The cache mapping display prefixes to normprefixes *)

    val mutable ns_default_normprefix = ""
      (* The default normprefix, or "" if none *)

    (*
    val mutable src_norm_mapping = [ "xml", "xml" ]
        (* Namespace processing: Contains pairs (srcprefix, normprefix).
         * srcprefix = "!" is used as guard.
         *)
    val mutable default_normprefix = ""
    *)

    method private init_ns_processing (mng:namespace_manager) =
      let scope = new namespace_scope_impl
                    mng None [ "xml", mng # get_primary_uri "xml" ] in
      ns_scope <- Some scope;
      ns_cache <- StringMap.empty;
      ns_default_normprefix <- ""

    method private push_src_norm_mapping (mng:namespace_manager) name attlist =
      (* [mng]:     namespace manager
       * [name]:    source name of element
       * [attlist]: source attribute list
       * returns quadruple (src_prefix, localname, norm_name, norm_attlist)
       *)
      (* Save state: *)
      Stack.push (ns_scope, ns_cache, ns_default_normprefix) ns_stack;
      let split_attlist =
        List.map (fun (name, value) -> namespace_split name, value) attlist
        (* TODO: Check that localname matches NCName *)
      in
      let xmlns_attlist = ref [] in
      let xmlns_default = ref None in
      let regular_attlist = ref [] in
      List.iter
        (fun ((prefix, localname), value) ->
           if prefix = "" && localname = "xmlns" then
             xmlns_default := Some value
           else if prefix = "xmlns" then
             xmlns_attlist := (localname, value) :: !xmlns_attlist
           else
             regular_attlist := (prefix, localname, value) :: !regular_attlist
        )
        split_attlist;
      let mapping = ref [] in
      List.iter
        (fun (srcprefix, uri) ->
           let _normprefix = mng # lookup_or_add_namespace srcprefix uri in
           mapping := (srcprefix, uri) :: !mapping;
        )
        !xmlns_attlist;
      (* Apply xmlns_default: *)
      ( match !xmlns_default with
            None -> ()
          | Some "" ->
              (* Delete default namespace: *)
              ns_default_normprefix <- "";
              mapping := ("", "") :: !mapping;
          | Some uri ->
              let normprefix =
                try mng # get_normprefix uri
                with Namespace_not_managed _ ->
                  mng # lookup_or_add_namespace "default" uri in
              ns_default_normprefix <- normprefix;
              mapping := ("", uri) :: !mapping;
      );
      (* Create new scope: *)
      let scope =
        if !mapping = [] then
          ns_scope
        else
          Some(new namespace_scope_impl mng ns_scope !mapping) in
      ns_scope <- scope;
      (* Clear cache - otherwise the new declarations wouldn't have any effect *)
      if !mapping <> [] then ns_cache <- StringMap.empty;
      (* NB.
normalize_namespace_prefix uses the new values for ns_scope and ns_cache, so these vars must already be updated *) (* Normalize the regular_attlist: *) let norm_attlist = List.map (fun (prefix, localname, value) -> (prefix, localname, self # normalize_namespace_prefix prefix localname, value ) ) !regular_attlist in (* Normalize the element name: *) let prefix, localname = namespace_split name in (* TODO: Check that localname matches NCName *) let norm_name = self # normalize_namespace_prefix ~apply_default:true prefix localname in (prefix, localname, norm_name, norm_attlist) method private pop_src_norm_mapping () = (* Pop until the guard is found *) let (scope, cache, default_normprefix) = Stack.pop ns_stack in ns_scope <- scope; ns_cache <- cache; ns_default_normprefix <- default_normprefix method private normalize_namespace_prefix ?(apply_default = false) prefix localname = if String.contains localname ':' then raise(Namespace_error("Found several colons in a name")); if prefix = "" then begin (* No prefix *) if apply_default && ns_default_normprefix <> "" then ns_default_normprefix ^ ":" ^ localname else localname end else begin (* Prefix exists *) let normprefix = try StringMap.find prefix ns_cache with Not_found -> let scope = match ns_scope with Some s -> s | None -> assert false in let np = try scope # normprefix_of_display_prefix prefix with Not_found -> raise(Namespace_error ("Namespace prefix not declared: " ^ prefix)) in ns_cache <- StringMap.add prefix np ns_cache; np in normprefix ^ ":" ^ localname end method private virtual sub_parser : unit -> core_parser (************************************************************************) (* Events. The parser calls these methods, and the subclass must * define them *) method private virtual init_for_xml_body : entity_id -> unit method private virtual event_document_xmldecl : Pxp_lexer_types.prolog_token list -> unit method private virtual event_start_tag : (string*int*int) option -> string -> (string * string) list -> bool -> entity_id -> unit method private virtual event_end_tag : string -> entity_id -> unit method private virtual event_char_data : string -> unit method private virtual event_pinstr : (string*int*int) option -> string -> string -> entity_id -> unit method private virtual event_comment : (string*int*int) option -> string list -> unit (************************************************************************) initializer (* CHECKS: *) if config.encoding <> dtd # encoding then failwith("Encoding mismatch"); (********* Here the method "parse" begins. The grammar below is * transformed to a local function of this method *) method parse context (start_symbol : extended_entry) = (* extend_dtd: * Whether the DTD should be extended by ELEMENT, ATTLIST, and * NOTATION declarations or not. (True for validating mode, * false for well-formedness mode.) 
*) let extend_dtd = match start_symbol with `Entry_document flags -> List.mem `Extend_dtd_fully flags || List.mem `Val_mode_dtd flags | `Entry_declarations flags -> List.mem `Extend_dtd_fully flags || List.mem `Val_mode_dtd flags | `Entry_continuation st -> st.cont_extend_dtd | _ -> false in ( match start_symbol with `Entry_document flags -> if not (List.mem `Val_mode_dtd flags) then (* Instead of dtd # allow_arbitrary, because the processing * instruction survives marshalling: *) dtd # add_pinstr (new proc_instruction "pxp:dtd" "optional-element-and-notation-declarations" config.encoding); | `Entry_declarations flags -> if not (List.mem `Val_mode_dtd flags) then dtd # add_pinstr (new proc_instruction "pxp:dtd" "optional-element-and-notation-declarations" config.encoding); | _ -> () ); (* process_xmldecl: * Whether the XML declaration is parsed and the found XML version * and standalone declaration are passed to 'doc'. *) let process_xmldecl = match start_symbol with `Entry_document flags -> List.mem `Parse_xml_decl flags | `Entry_continuation st -> st.cont_process_xmldecl | _ -> false in let parse_ignored_section yy_current yy_get_next = (* A special parser which should be used after <![IGNORE[. * It parses until the corresponding ]]> is found. *) while yy_current() = Ignore do ignore(yy_get_next()); done; ( match yy_current() with Conditional_body _ -> () | _ -> raise Parsing.Parse_error; ); let en = context.manager # current_entity in let llev = ref 1 in while !llev >= 1 do let igntok = en # next_ignored_token in (* next_ignored_token: uses a special lexer that only * recognizes Conditional_begin and Conditional_end; * other character combinations are ignored. *) (* NOTE: next_ignored_token works much like yy_get_next, * but it does not set the current token! *) match igntok with Conditional_begin _ -> llev := !llev + 1 | Conditional_end _ -> llev := !llev - 1; (* Because the loop may be exited now: *) context.current_token <- igntok; | (End_entity | Eof) -> raise Parsing.Parse_error | _ -> () done; in let recode_utf8 s = (* Recode 's' to UTF-8 *) if config.encoding = `Enc_utf8 then s (* No recoding necessary *) else Netconversion.convert ~in_enc:(config.encoding :> encoding) ~out_enc:`Enc_utf8 s in let process_curly_brace tok = let s = match config.escape_contents with None -> ( match tok with Lcurly -> "{" | LLcurly -> "{{" | Rcurly -> "}" | RRcurly -> "\125\125" | _ -> assert false ) | Some f -> f tok context.manager in if s <> "" then begin if n_tags_open = 0 then self # only_whitespace s else self # event_char_data s end in let process_attribute_event tok pos = match config.escape_attributes with None -> assert false | Some f -> f tok pos context.manager in let reused_lexobj = lfactory # open_string "" in %% /* The following grammar looks similar to ocamlyacc grammars, but * ocamlyacc is actually not used to transform the grammar into a parser. * Instead, the parser generator m2parsergen is applied. * * The format of the grammar is different (see m2parsergen/README), * but I hope that you can understand most features immediately. * * The type of the parser is different: m2parsergen creates a top-down * parser while ocamlyacc generates a LALR-1 parser. * * The way the generated code is called is different: ocamlyacc produces * lots of top-level definitions whereas m2parsergen generates only * a local let-in-phrase. This is explained in the already mentioned * README file. 
*/ /* See Pxp_core_types.ml for comments to the various tokens */ %token Begin_entity %token End_entity %token <> Comment_begin %token <> Comment_end %token Ignore %token Eq %token Rangle %token Rangle_empty %token <> Conditional_begin %token <> Conditional_body %token <> Conditional_end %token Percent %token Plus %token Star %token Bar %token Comma %token Qmark %token Pcdata %token Required %token Implied %token Fixed %token Eof %token Lcurly %token LLcurly %token Rcurly %token RRcurly %token SQuote %token DQuote %token <> Comment_material %token <> Doctype %token <> Doctype_rangle %token <> Dtd_begin %token <> Dtd_end %token <> Decl_element %token <> Decl_attlist %token <> Decl_entity %token <> Decl_notation %token <> Decl_rangle %token <> Lparen %token <> Rparen %token <> RparenPlus %token <> RparenStar %token <> RparenQmark %token <> Tag_beg %token <> Tag_end %token <> PI %token <> PI_xml %token <> Cdata %token <> CRef %token <> ERef %token <> ERef_att %token <> PERef %token <> CharData %token <> LineEnd %token <> Name %token <> Nametoken %token <> Attval %token <> Attval_nl_normalized %token <> Unparsed_string /* START SYMBOLS: * * "entry_document": parses a complete XML document (i.e. containing a * <!DOCTYPE..> and an element) * "entry_declarations": parses an "external DTD subset", i.e. a sequence * of declarations * "entry_element_content": parses a single element (no <!DOCTYPE...> allowed); * the element needs not to be the root element of the * DTD * * The functions corresponding to these symbols return always () because * they only have side-effects. */ /* SOME GENERAL COMMENTS: * * The parser does not get its tokens from the lexers directly. Instead of * this, there is an entity object between the parser and the lexers. This * object already handles: * * - References to general and parameter entities. The token stream is * modified such that tokens automatically come from the referenced entities. * External parameter entities and all general entities are embraced by * the two special tokens Begin_entity and End_entity. The parser must * check that these braces are correctly nested. */ %% entry_document(): Begin_entity $ {{ n_entities_open <- n_entities_open + 1; }} doc_xmldecl_then_misc_then_prolog_then_body() {{ (* Note: In pull parsing mode, the following code won't be executed! *) () }} /* In the following rule, we must find out whether there is an XML declaration * or not, and directly after that either "process_xmldecl" or * "process_missing_xmldecl" of the current entity must be called. * AND IT MUST BE DIRECTLY! Because of this, the invocation is carried out * in the "$" clause immediately following the first token. * * TODO: This is not enough. The first token may be a tag, and the tag * may already contain non-ASCII characters. (But in this case, the resolvers * assume UTF8, and they are right...) 
*/ doc_xmldecl_then_misc_then_prolog_then_body(): pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; if process_xmldecl then self # event_document_xmldecl pl; }} misc()* doc_prolog_then_body() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} misc() misc()* doc_prolog_then_body() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} doctypedecl() misc()* body_start() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} body_start() {{ () }} doc_prolog_then_body(): doctypedecl() misc()* body_start() {{ () }} | body_start() {{ () }} entry_element_content(): Begin_entity $ {{ n_entities_open <- n_entities_open + 1; }} el_xmldecl_then_misc_then_body() {{ (* Note: In pull parsing mode the following code won't be executed! *) () }} entry_content(): Begin_entity $ {{ permit_any_content <- true; n_entities_open <- n_entities_open + 1; }} el_xmldecl_then_entity_body() {{ (* Note: In pull parsing mode the following code won't be executed! *) () }} /* See comment for doc_mldecl_then_misc_then_prolog_then_body. */ el_xmldecl_then_misc_then_body(): pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; }} misc()* body_start() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} misc() misc()* body_start() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} body_start() {{ () }} el_xmldecl_then_entity_body(): pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; }} entity_body() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} entity_body() {{ () }} entry_declarations(): /* Parses a sequence of declarations given by an entity. As side-effect, * the parsed declarations are put into the dtd object. * * Note: The following Begin_entity is not counted because this entity * will certainly be closed when pull parsing mode is entered. */ Begin_entity decl_xmldecl_then_body() {{ () }} | Eof {{ () }} decl_xmldecl_then_body(): /* Note: This rule is also called from declaration()! */ pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; }} declaration()* End_entity {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} declaration() declaration()* End_entity {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} End_entity {{ () }} misc(): pi() {{ () }} | data: CharData /* In this context, the lexers sometimes do not recognize white space; * instead CharData tokens containing white space are delivered. */ {{ self # only_whitespace data }} | Ignore {{ () }} | comment() {{ () }} /********************* DOCUMENT TYPE DECLARATION *************************/ doctypedecl(): /* parses from <!DOCTYPE to >. As side-effect, first the declarations of * the internal DTD (if any) are put into !!on_dtd, then the declarations * of the external DTD (if any) are put into this DTD object. */ doctype_entid: Doctype ws: Ignore Ignore* doctypedecl_material (doctype_entid) {{ () }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after `DOCTYPE'")) | _ -> raise(WF_error("Bad DOCTYPE declaration")) }} /* TRICK: * ws: Ignore? Ignore* * is meant seriously. The effect is that ws becomes a boolean variable * which is true if there is an Ignore token and false otherwise. * This construct is faster than just * ws: Ignore* * in which case ws becomes an integer variable containing the number of * Ignore tokens. 
Counting the number of tokens is slower than only checking * the existence. * * We need the information whether there is an Ignore token (representing * white space), because white space is only obligatory if also an identifier * for the external subset is parsed; this conditional syntax constraint is * simply programmed in the body of the grammar rule. */ doctypedecl_material(doctype_entid): root_name: Name ws: Ignore? Ignore* external_subset: external_id()? Ignore* internal_subset: internal_dtd()? Ignore* doctype_rangle_entid: Doctype_rangle {{ if doctype_entid != doctype_rangle_entid then raise (Validation_error("Entities not properly nested with DOCTYPE declaration")); dtd # set_root root_name; begin match external_subset, internal_subset with None, None -> () (* no DTD means no ID *) | None, Some _ -> dtd # set_id Internal | Some id, None -> dtd # set_id (External id) | Some id, Some _ -> dtd # set_id (Derived id) end; (* Get now the external doctype declaration. Note that the internal * subset has precedence and must be read first. *) begin match external_subset with None -> () | Some id -> if not ws then raise(WF_error("Whitespace is missing after `DOCTYPE " ^ root_name ^ "'")); let resolver = context.manager # current_resolver # clone in let system_base = (context.manager # current_resolver # active_id).rid_system in let pobj = self # sub_parser() in let en = new Pxp_entity.external_entity resolver dtd "[dtd]" config.swarner config.warner id system_base false config.encoding in en # set_debugging_mode (config.debugging_mode); let mgr = new entity_manager en dtd in en # open_entity true Declaration; begin try let context = make_context mgr in let flags = (if extend_dtd then [`Extend_dtd_fully] else []) in pobj # parse context (`Entry_declarations flags); mgr # pop_entity_until en; if en # is_open then ignore(en # close_entity); with error -> (* When the error happens, the current entity can be * different than [en]. So the entity stack is muddled up, * and we need to pop entites until we find [en] again. *) let pos = mgr # position_string in mgr # pop_entity_until en; if en # is_open then ignore(en # close_entity); raise (At(pos, error)) end; end; dtd # validate }} ? {{ match !yy_position with "doctype_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad DOCTYPE declaration")) }} /* Note that there are no keywords for SYSTEM or PUBLIC, as these would * be difficult to recognize in the lexical contexts. Because of this, * SYSTEM/PUBLIC is parsed as name, and the rule for everything after * SYSTEM/PUBLIC is computed dynamically. */ external_id(): tok:Name $ {{ let followup = match tok with "SYSTEM" -> parse_system_id (* Apply the rule system_id (below) to parse the * rest of the ID *) | "PUBLIC" -> parse_public_id (* Apply the rule public_id (below) to parse the * rest of the ID *) | _ -> raise(WF_error("SYSTEM or PUBLIC expected")) in }} ws:Ignore Ignore* r:[followup]() {{ r }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after " ^ tok)) | _ -> raise(WF_error("Bad SYSTEM or PUBLIC identifier")) }} system_id(): str:Unparsed_string {{ System (recode_utf8 str) }} public_id(): str1: Unparsed_string ws: Ignore Ignore* str2: Unparsed_string {{ check_public_id str1; Public(recode_utf8 str1, recode_utf8 str2) }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing between the literals of the PUBLIC identifier")) | _ -> raise(WF_error("Bad PUBLIC identifier")) }} /* The internal subset: "[" declaration* "]". 
While parsing the declarations * the object variable p_internal_subset must be true; however, if there * are entity references, this variable must be reset to false during * the entity. (See the rule for "declaration" below.) */ internal_dtd(): dtd_begin_entid: internal_dtd_begin() declaration()* dtd_end_entid: internal_dtd_end() {{ if dtd_begin_entid != dtd_end_entid then raise(Validation_error("Entities not properly nested with internal DTD subset")) }} ? {{ match !yy_position with "dtd_end_entid" -> raise(WF_error("`]' expected")) | _ -> raise(WF_error("Bad internal DTD subset")) }} internal_dtd_begin(): Dtd_begin {{ assert (not p_internal_subset); p_internal_subset <- true }} internal_dtd_end(): Dtd_end {{ assert p_internal_subset; p_internal_subset <- false }} declaration(): /* Parses a single declaration (or processing instruction). As side-effect * the parsed declaration is stored into the dtd object. */ elementdecl() {{ () }} | attlistdecl() {{ () }} | entid:Decl_entity ws:Ignore Ignore* _e:entitydecl(entid) {{ () }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after ENTITY")) | "_e" -> raise(WF_error("Name or `%' expected")) | _ -> raise(WF_error("Bad entity declaration")) }} | notationdecl() {{ () }} | pi: PI {{ let target, value, ent_id = pi in let pi = new proc_instruction target value config.encoding in dtd # add_pinstr pi }} | Ignore {{ () }} | begin_entid:Comment_begin Comment_material* end_entid:Comment_end {{ if begin_entid != end_entid then raise(Validation_error("The first and the last token of comments must be in the same entity")); }} ? {{ match !yy_position with "end_entid" -> raise(WF_error("`-->' expected")) | _ -> raise(WF_error("Bad comment")) }} | Begin_entity $ {{ (* Set 'p_internal_subset' to 'false' until the matching 'end_entity' * rule is parsed. This allows unrestricted usage of parameter entities * within declarations of internal entities. *) let old_p_internal_subset = p_internal_subset in p_internal_subset <- false; }} decl_xmldecl_then_body() {{ (* Restore the old value of 'p_internal_subset'. *) p_internal_subset <- old_p_internal_subset; () }} | begin_entid:Conditional_begin $ {{ (* Check whether conditional sections are allowed at this position. *) if p_internal_subset then raise(WF_error("Restriction of the internal subset: Conditional sections not allowed")); }} Ignore* _cond:conditional_section() end_entid:Conditional_end {{ (* Check whether Conditional_begin and Conditional_end are in the same * entity. (This restriction is explained in the file SPECS.) *) if begin_entid != end_entid then raise(Validation_error("The first and the last token of conditional sections must be in the same entity (additional restriction of this parser)")); }} ? {{ match !yy_position with "end_entid" -> raise(WF_error("`>]>' expected")) | "_cond" -> raise(WF_error("INCLUDE or IGNORE expected")) | _ -> raise(WF_error("Bad conditional section")) }} /* The tokens INCLUDE/IGNORE are scanned as names, and the selection of the * right parsing rule is dynamic. * Note that parse_ignored_section is not defined by a grammar rule but * by a conventional let-binding above. */ conditional_section(): include_or_ignore:Name $ {{ let parsing_function = match include_or_ignore with "INCLUDE" -> parse_included_section (* invoke rule "included_section" below *) | "IGNORE" -> parse_ignored_section (* invoke function "parse_ignored_section" *) | _ -> raise(WF_error("INCLUDE or IGNORE expected")) in }} [ parsing_function ] () {{ () }} ? 
{{ ignore(!yy_position); raise(WF_error("Bad conditional section")) }} included_section(): Conditional_body declaration()* {{ () }} | Ignore Ignore* Conditional_body declaration()* {{ () }} /*************************** ELEMENT DECLARATIONS ********************/ elementdecl(): /* parses <!ELEMENT ... >. Puts the parsed element type as side-effect into * dtd. */ decl_element_entid: Decl_element $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws1: Ignore Ignore* name: Name ws2: Ignore Ignore* content_model: contentspec() Ignore* decl_rangle_entid: Decl_rangle {{ if decl_element_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ELEMENT declaration"); if extend_dtd then begin let el = new dtd_element dtd name in (* It is allowed that an <!ATTLIST...> precedes the corresponding * <!ELEMENT...>. Because of this it is possible that there is already * an element called 'name' in the DTD, and we only must set the content * model of this element. *) try dtd # add_element el; el # set_cm_and_extdecl content_model extdecl; with Not_found -> (* means: there is already an element 'name' *) let el' = dtd # element name in el' # set_cm_and_extdecl content_model extdecl; (* raises Validation_error if el' already has a content model *) end }} ? {{ match !yy_position with ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) | "name" -> raise(WF_error("The name of the element is expected here")) | "content_model" -> raise(WF_error("Content model expression expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad element type declaration")) }} contentspec(): /* parses a content model and returns it (type content_model_type) */ name: Name /* EMPTY or ANY */ {{ match name with "EMPTY" -> Empty | "ANY" -> Any | _ -> raise(WF_error("EMPTY, ANY, or a subexpression expected")) }} | entid:Lparen Ignore* term:mixed_or_regexp(entid) {{ term }} ? {{ raise(WF_error("Bad content model expression")) }} /* Many of the following rules have an lparen_entid argument. This is the * internal ID of the entity containing the corresponding left parenthesis; * by comparing it with the ID of the entity of the right parenthesis the * contraint is implemented that both parentheses must be in the same entity. */ mixed_or_regexp(lparen_entid): re: choice_or_seq(lparen_entid) {{ Regexp re }} | m: mixed(lparen_entid) {{ m }} multiplier(): /* returns one of the multiplier symbols (?,*,+) */ Plus {{ Plus }} | Star {{ Star }} | Qmark {{ Qmark }} mixed (lparen_entid) : Pcdata Ignore* material: mixed_alternatives_top() {{ let rest, rparen_entid = material in if lparen_entid != rparen_entid then raise (Validation_error "Entities not properly nested with parentheses"); Mixed (MPCDATA :: rest) }} ? {{ raise(WF_error("Bad content model expression")) }} mixed_alternatives_top(): entid: Rparen {{ [], entid }} | entid: RparenStar {{ [], entid }} | Bar Ignore* name:Name Ignore* names:mixed_alternative()* entid:RparenStar {{ (MChild name :: names), entid }} ? {{ match !yy_position with "name" -> raise(WF_error("Name expected")) | "entid" -> raise(WF_error("`)*' expected")) | _ -> raise(WF_error("Bad content model expression")) }} mixed_alternative() : Bar Ignore* name:Name Ignore* {{ MChild name }} ? {{ match !yy_position with "name" -> raise(WF_error("Name expected")) | _ -> raise(WF_error("Bad content model expression")) }} choice_or_seq (lparen_entid): /* parses either a regular expression, or a mixed expression. 
Returns * Mixed spec or Regexp spec (content_model_type). * Which kind of expression (regexp or mixed) is being read is recognized * after the first subexpression has been parsed; the other subexpressions * must be of the same kind. */ re: cp() Ignore* factor: choice_or_seq_factor() {{ let (finalmark,subexpr), rparen_entid = factor in if lparen_entid != rparen_entid then raise (Validation_error "Entities not properly nested with parentheses"); (* Check that the other subexpressions are "regexp", too, and * merge them with the first. *) let re' = match subexpr with Alt [] -> re | Alt alt -> Alt (re :: alt) | Seq seq -> Seq (re :: seq) | _ -> assert false in (* Interpret the finalmark. *) match finalmark with Ignore -> re' | Plus -> Repeated1 re' | Star -> Repeated re' | Qmark -> Optional re' | _ -> assert false }} ? {{ raise(WF_error("Bad content model expression")) }} choice_or_seq_factor(): /* Parses "|<subexpr>|...)" or ",<subexpr>,...)", both forms optionally * followed by ?, *, or +. * Returns ((finalmark, expr), rparen_entid), where * - finalmark is the character after the right parenthesis or Ignore * - expr is either * Alt [] meaning that only ")" has been found * Alt non_empty_list meaning that the subexpressions are separated by '|' * Seq non_empty_list meaning that the subexpressions are separated by ',' */ entid:Rparen {{ (Ignore, Alt []), entid }} | entid:RparenPlus {{ (Plus, Alt []), entid }} | entid:RparenStar {{ (Star, Alt []), entid }} | entid:RparenQmark {{ (Qmark, Alt []), entid }} | Bar Ignore* re:cp() Ignore* factor:choice_or_seq_factor() {{ let (finalmark, subexpr), rparen_entid = factor in begin match subexpr with Alt [] -> (finalmark, (Alt [re])), rparen_entid | Alt alt -> (finalmark, (Alt (re :: alt))), rparen_entid | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences")) end }} ? {{ raise(WF_error("Bad content model expression")) }} | Comma Ignore* re:cp() Ignore* factor:choice_or_seq_factor() {{ let (finalmark, subexpr), rparen_entid = factor in begin match subexpr with Alt [] -> (finalmark, (Seq [re])), rparen_entid | Seq seq -> (finalmark, (Seq (re :: seq))), rparen_entid | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences")) end }} ? {{ raise(WF_error("Bad content model expression")) }} cp(): /* parse either a name, or a parenthesized subexpression "(...)" */ name:Name m:multiplier()? {{ match m with None -> Child name | Some Plus -> Repeated1 (Child name) | Some Star -> Repeated (Child name) | Some Qmark -> Optional (Child name) | _ -> assert false }} ? {{ raise(WF_error("Bad content model expression")) }} | entid:Lparen Ignore* m:choice_or_seq(entid) {{ m }} ? {{ raise(WF_error("Bad content model expression")) }} /********************* ATTRIBUTE LIST DECLARATION ***********************/ attlistdecl(): /* parses <!ATTLIST ... >. Enters the attribute list in dtd as side- * effect. */ decl_attlist_entid: Decl_attlist $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws1: Ignore Ignore* el_name: Name ws: Ignore? Ignore* factor: attdef_factor() {{ let at_list, decl_rangle_entid = factor in if decl_attlist_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ATTLIST declaration"); if not ws && at_list <> [] then begin match at_list with (name,_,_) :: _ -> (* This is normally impossible, because the lexer demands * some other token between two names. 
*) raise(WF_error("Whitespace is missing before `" ^ name ^ "'")); | _ -> assert false end; if extend_dtd then begin let new_el = new dtd_element dtd el_name in (* Note that it is allowed that <!ATTLIST...> precedes the corresponding * <!ELEMENT...> declaration. In this case we add the element declaration * already to the DTD but leave the content model unspecified. *) let el = try dtd # add_element new_el; new_el with Not_found -> (* already added *) let old_el = dtd # element el_name in if old_el # attribute_names <> [] then warn config.swarner config.warner (`W_multiple_ATTLIST_declarations el_name); old_el in List.iter (fun (a_name, a_type, a_default) -> el # add_attribute a_name a_type a_default extdecl) at_list end }} ? {{ match !yy_position with "ws1" -> raise(WF_error("Whitespace is missing after ATTLIST")) | "el_name" -> raise(WF_error("The name of the element is expected here")) | "factor" -> raise(WF_error("Another attribute name or `>' expected")) | _ -> raise(WF_error("Bad attribute declaration")) }} attdef_factor(): /* parses a list of triples <name> <type> <default value> and returns the * list as (string * att_type * att_default) list. */ attdef:attdef() ws:Ignore? Ignore* factor:attdef_factor() {{ let attdef_rest, decl_rangle_entid = factor in if not ws && attdef_rest <> [] then begin match attdef_rest with (name,_,_) :: _ -> raise(WF_error("Missing whitespace before `" ^ name ^ "'")); | _ -> assert false end; (attdef :: attdef_rest), decl_rangle_entid }} ? {{ match !yy_position with | "factor" -> raise(WF_error("Another attribute name or `>' expected")) | _ -> raise(WF_error("Bad attribute declaration")) }} | entid:Decl_rangle {{ [], entid }} attdef(): /* Parses a single triple */ name: Name ws1: Ignore Ignore* tp: atttype() ws2: Ignore Ignore* default: defaultdecl() {{ (name,tp,default) }} ? {{ match !yy_position with ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) | "tp" -> raise(WF_error("Type of attribute or `(' expected")) | "default" -> raise(WF_error("#REQUIRED, #IMPLIED, #FIXED or a string literal expected")) | _ -> raise(WF_error("Bad attribute declaration")) }} atttype(): /* Parses an attribute type and returns it as att_type. */ name: Name $ {{ let followup = if name = "NOTATION" then parse_notation else parse_never in }} nota: [followup]()? {{ match name with "CDATA" -> A_cdata | "ID" -> A_id | "IDREF" -> A_idref | "IDREFS" -> A_idrefs | "ENTITY" -> A_entity | "ENTITIES" -> A_entities | "NMTOKEN" -> A_nmtoken | "NMTOKENS" -> A_nmtokens | "NOTATION" -> (match nota with None -> raise(WF_error("Error in NOTATION type (perhaps missing whitespace after NOTATION?)")) | Some n -> n ) | _ -> raise(WF_error("One of CDATA, ID, IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS, NOTATION, or a subexpression expected")) }} ? {{ raise(WF_error("Bad attribute declaration (perhaps missing whitespace after NOTATION)")) }} | Lparen Ignore* name: name_or_nametoken() Ignore* names: nmtoken_factor()* _rp: Rparen /* Enumeration */ {{ A_enum(name :: names) }} ? {{ match !yy_position with "name" -> raise(WF_error("Name expected")) | "names" -> raise(WF_error("`|' and more names expected, or `)'")) | "_rp" -> raise(WF_error("`|' and more names expected, or `)'")) | _ -> raise(WF_error("Bad enumeration type")) }} never(): /* The always failing rule */ $ {{ raise Not_found; }} Doctype /* questionable */ {{ A_cdata (* Does not matter *) }} notation(): Ignore Ignore* _lp: Lparen Ignore* name: Name Ignore* names: notation_factor()* _rp: Rparen {{ A_notation(name :: names) }} ? 
{{ match !yy_position with "_lp" -> raise(WF_error("`(' expected")) | "name" -> raise(WF_error("Name expected")) | "names" -> raise(WF_error("`|' and more names expected, or `)'")) | "_rp" -> raise(WF_error("`|' and more names expected, or `)'")) | _ -> raise(WF_error("Bad NOTATION type")) }} notation_factor(): /* Parse "|<name>" and return the name */ Bar Ignore* name:Name Ignore* {{ name }} ? {{ match !yy_position with "name" -> raise(WF_error("Name expected")) | _ -> raise(WF_error("Bad NOTATION type")) }} nmtoken_factor(): /* Parse "|<nmtoken>" and return the nmtoken */ Bar Ignore* n:name_or_nametoken() Ignore* {{ n }} ? {{ match !yy_position with "n" -> raise(WF_error("Nametoken expected")) | _ -> raise(WF_error("Bad enumeration type")) }} name_or_nametoken(): n:Name {{ n }} | n:Nametoken {{ n }} /* The default values must be expanded and normalized. This has been implemented * by the function expand_attvalue. */ defaultdecl(): /* Parse the default value for an attribute and return it as att_default */ Required {{ D_required }} | Implied {{ D_implied }} | Fixed ws:Ignore Ignore* str:Unparsed_string {{ D_fixed (expand_attvalue reused_lexobj dtd str false) }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after #FIXED")) | "str" -> raise(WF_error("String literal expected")) | _ -> raise(WF_error("Bad #FIXED default value")) }} | str:Unparsed_string {{ D_default (expand_attvalue reused_lexobj dtd str false) }} /**************************** ENTITY DECLARATION ***********************/ entitydecl(decl_entity_entid): /* parses everything _after_ <!ENTITY until the matching >. The parsed * entity declaration is entered into the dtd object as side-effect. */ name: Name $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws: Ignore Ignore* material: entitydef() Ignore* decl_rangle_entid: Decl_rangle /* A general entity */ {{ if decl_entity_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ENTITY declaration"); let en = (* Distinguish between * - internal entities * - external entities * - NDATA (unparsed) entities *) match material with (Some s, None, None) -> new Pxp_entity.internal_entity dtd name config.swarner config.warner s p_internal_subset false config.encoding | (None, Some xid, None) -> let resolver = context.manager # current_resolver # clone in let system_base = (context.manager # current_resolver # active_id).rid_system in new Pxp_entity.external_entity resolver dtd name config.swarner config.warner xid system_base false config.encoding | (None, Some xid, Some n) -> (new Pxp_entity.ndata_entity name xid n config.encoding :> Pxp_entity.entity) | _ -> assert false in dtd # add_gen_entity en extdecl }} ? 
{{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing")) | "material" -> raise(WF_error("String literal or identifier expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad entity declaration")) }} | Percent $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws1: Ignore Ignore* name: Name ws2: Ignore Ignore* material: pedef() Ignore* decl_rangle_entid: Decl_rangle /* A parameter entity */ {{ if decl_entity_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ENTITY declaration"); let en = (* Distinguish between internal and external entities *) match material with (Some s, None) -> new Pxp_entity.internal_entity dtd name config.swarner config.warner s p_internal_subset true config.encoding | (None, Some xid) -> let resolver = context.manager # current_resolver # clone in let system_base = (context.manager # current_resolver # active_id).rid_system in new Pxp_entity.external_entity resolver dtd name config.swarner config.warner xid system_base true config.encoding | _ -> assert false in (* The following two lines force that even internal entities count * as external (for the standalone check) if the declaration of * the internal entity occurs in an external entity. *) if extdecl then en # set_counts_as_external; dtd # add_par_entity en; }} ? {{ match !yy_position with ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) | "material" -> raise(WF_error("String literal or identifier expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad entity declaration")) }} entitydef(): /* parses the definition value of a general entity. Returns either: * - (Some s, None, None) meaning the definition of an internal entity * with (literal) value s has been found * - (None, Some x, None) meaning that an external parsed entity with * external ID x has been found * - (None, Some x, Some n) meaning that an unparsed entity with * external ID x and notations n has been found */ str:Unparsed_string {{ Some str, None, None }} | id:external_id() ws:Ignore? Ignore* decl:ndatadecl()? {{ if not ws && decl <> None then raise(WF_error("Whitespace missing before `NDATA'")); None, Some id, decl }} pedef(): /* parses the definition value of a parameter entity. Returns either: * - (Some s, None) meaning that the definition of an internal entity * with (literal) value s has been found * - (None, Some x) meaning that an external ID x has been found */ str:Unparsed_string {{ Some str, None }} | id:external_id() {{ None, Some id }} ndatadecl(): /* Parses either NDATA "string" or the empty string; returns Some "string" * in the former, None in the latter case. */ ndata:Name ws:Ignore Ignore* name:Name {{ if ndata = "NDATA" then name else raise(WF_error("NDATA expected")) }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after NDATA")) | "name" -> raise(WF_error("Name expected")) | _ -> raise(WF_error("Bad NDATA declaration")) }} /**************************** NOTATION DECLARATION *******************/ notationdecl(): /* parses <!NOTATION ... > and enters the notation declaration into the * dtd object as side-effect */ decl_notation_entid: Decl_notation ws1: Ignore Ignore* name: Name ws2: Ignore Ignore* sys_or_public: Name /* SYSTEM or PUBLIC */ ws3: Ignore Ignore* str1: Unparsed_string ws: Ignore? Ignore* str2: Unparsed_string? 
Ignore* decl_rangle_entid: Decl_rangle {{ if decl_notation_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with NOTATION declaration"); let xid = (* Note that it is allowed that PUBLIC is only followed by one * string literal *) match sys_or_public with "SYSTEM" -> if str2 <> None then raise(WF_error("SYSTEM must be followed only by one argument")); System (recode_utf8 str1) | "PUBLIC" -> begin match str2 with None -> check_public_id str1; Public(recode_utf8 str1,"") | Some p -> if not ws then raise(WF_error("Missing whitespace between the string literals of the `PUBLIC' id")); check_public_id str1; Public(recode_utf8 str1, recode_utf8 p) end | _ -> raise(WF_error("PUBLIC or SYSTEM expected")) in if extend_dtd then begin let no = new dtd_notation name xid config.encoding in dtd # add_notation no end }} ? {{ match !yy_position with ("ws1"|"ws2"|"ws3") -> raise(WF_error("Whitespace is missing")) | "name" -> raise(WF_error("Name expected")) | "sys_or_public" -> raise(WF_error("SYSTEM or PUBLIC expected")) | ("str1"|"str2") -> raise(WF_error("String literal expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad NOTATION declaration")) }} /****************************** ELEMENTS **************************/ /* In the following rules, the number of error rules is reduced to * improve the performance of the parser. */ body_start(): /* parses <element>...</element> misc*, i.e. exactly one element followed * optionally by white space or processing instructions. * The element is entered into the global variables as follows: * - If elstack is non-empty, the parsed element is added as new child to * the top element of the stack. * - If elstack is empty, the root_examplar object is modified rather than * that a new element is created. If additionally the variable root is * None, it is assigned Some root_examplar. * Note that the modification of the root_exemplar is done by the method * internal_init. * The reason why the root element is modified rather than newly created * is a typing requirement. It must be possible that the class of the root * is derived from the original class element_impl, i.e. the user must be * able to add additional methods. If we created a new root object, we * would have to denote to which class the new object belongs; the root * would always be an 'element_impl' object (and not a derived object). * If we instead cloned an exemplar object and modified it by the * "create" method, the root object would belong to the same class as the * exemplar (good), but the type of the parsing function would always * state that an 'element_impl' was created (because we can pass the new * object only back via a global variable). The only solution is to * modify the object that has been passed to the parsing function directly. */ $ {{ self # init_for_xml_body (context.manager # current_entity :> entity_id); let parse_fn = if pull_counter < 0 then parse_content_push else parse_content_pull in }} start_tag() [parse_fn]() {{ () }} entity_body(): /* The body of an external entity, i.e. content. 
*/ End_entity {{ self # init_for_xml_body (context.manager # current_entity :> entity_id); n_entities_open <- n_entities_open - 1; raise End_of_parsing }} | $ {{ self # init_for_xml_body (context.manager # current_entity :> entity_id); let parse_fn = if pull_counter < 0 then parse_content_push else parse_content_pull in }} node_tag() [parse_fn]() {{ () }} entry_expr(): $ {{ self # init_for_xml_body (context.manager # current_entity :> entity_id); }} entry_expr_content() {{ () }} entry_expr_content(): start_tag() $ {{ begin try while n_tags_open > 0 do parse_node_tag yy_current yy_get_next done with Not_found -> yy_position := "expr"; raise Parsing.Parse_error end; }} {{ () }} ? {{ match !yy_position with "expr" -> raise(WF_error("Unexpected token")) | _ -> raise(WF_error("Syntax error")) }} | pi() {{ () }} | comment() {{ () }} | entry_expr_space() {{ () }} | CRef {{ raise(WF_error("Character entity not allowed here")) }} | Begin_entity entry_expr() entry_expr_space()* End_entity {{ () }} entry_expr_space(): data:CharData {{ self # only_whitespace data }} content_push(): node_tag() node_tag()* {{ () }} content_pull(): node_tag() $ {{ pull_counter <- pull_counter - 1; if pull_counter <= 0 then begin pull_counter <- pull_counter_limit; let state = { cont_context = context; cont_extend_dtd = extend_dtd; cont_process_xmldecl = process_xmldecl; } in raise(Interrupt_parsing state) end; }} content_pull() {{ () }} node_tag(): /* parses: start tags, end tags, content, or processing * instructions. That the tags are properly nested is dynamically checked. * As result, recognized elements are added to their parent elements, * content is added to the element containing it, and processing instructions * are entered into the element embracing them. (All as side-effects.) */ start_tag() {{ () }} | end_tag() {{ () }} | char_data() {{ () }} | cref() {{ () }} | pi() {{ () }} | comment() {{ () }} | Begin_entity $ {{ if n_tags_open = 0 && not permit_any_content then raise(WF_error("Entity reference not allowed here")); n_entities_open <- n_entities_open + 1; }} init_inner_entity() {{ () }} | End_entity {{ n_entities_open <- n_entities_open - 1; if n_entities_open = 0 then begin if n_tags_open <> 0 then raise(WF_error("Missing end tag")); raise End_of_parsing end; }} /* See comment for doc_mldecl_then_misc_then_prolog_then_body. */ init_inner_entity(): pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; }} {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} node_tag() {{ () }} start_tag(): /* parses <element attribute-values> or <element attribute-values/>. * * EFFECT: If elstack is non-empty, the element is added to the * top element of the stack as new child, and the element * is pushed on the stack. If elstack is empty, the root_exemplar is * modified and gets the parsed name and attribute list. The root_exemplar * is pushed on the stack. If additionally the variable root is empty, too, * this variable is initialized. * If the <element ... /> form has been parsed, no element is pushed * on the stack. */ /* TODO: Support for namespaces. xmlns attributes must be identified. * These atts are not added to the regular attribute list, but the * contained namespace information is interpreted. * We need a stack tracking namespace declarations (PUSH for every * start tag; POP for every end tag). Element names and attribute names * are splitted into src_prefix and localname. The src_prefix is mapped * to norm_prefix using the mentioned stack. 
The element and attributes * are created with names "norm_prefix:localname". * If configured, the namespace_info field of elements is set from * the current contents of the namespace stack. * New config options: * - namespace_manager = None or Some mng * - namespace_objects = true or false */ tag: Tag_beg $ {{ let position = if config.store_element_positions then Some(context.manager # position) else None in let attlist = ref [] in let attspace = ref true in }} _ws: Ignore? Ignore* attribute(attlist,attspace)* emptiness: start_tag_rangle() /* Note: it is guaranteed that there is whitespace between Tag_beg and * the name of the first attribute, because there must be some separator. * So we need not to check ws! */ {{ let name0, tag_beg_entid = tag in let name = if config.enable_name_pool_for_element_types then make_pool_string name0 else name0 in self # event_start_tag position name !attlist emptiness tag_beg_entid; if not emptiness then n_tags_open <- n_tags_open + 1; }} ? {{ match !yy_position with | "emptiness" -> raise(WF_error("`>' or `/>' expected")) | _ -> raise(WF_error("Bad start tag")) }} attribute(attlist,attspace): /* Parses name="value" */ n0:Name Ignore* Eq Ignore* v:attval() ws:Ignore? Ignore* {{ if not !attspace then raise(WF_error("Whitespace is missing before attribute `" ^ n0 ^ "'")); let n = if config.enable_name_pool_for_attribute_names then make_pool_string n0 else n0 in attlist := (n,v) :: !attlist; attspace := ws; () }} attval(): v:Attval {{ expand_attvalue reused_lexobj dtd v true }} | v:Attval_nl_normalized {{ expand_attvalue reused_lexobj dtd v false }} /* The following rules are only used if event-based attribute parsing * is enabled. In this case, neither Attval nor Attval_nl_normalized, * but SQuote, DQuote and attribute tokens are scanned. 
*/ | SQuote $ {{ let attpos = ref 0 in }} sl:attval_squote_token(attpos)* SQuote {{ String.concat "" sl }} | DQuote $ {{ let attpos = ref 0 in }} sl:attval_dquote_token(attpos)* DQuote {{ String.concat "" sl }} attval_squote_token(attpos): name:ERef_att {{ let etext = "&" ^ name ^ ";" in let s = Pxp_aux.expand_attvalue reused_lexobj dtd etext false in attpos := !attpos + String.length s; s }} | code:CRef {{ let s = Pxp_aux.character ?swarner:config.swarner config.encoding config.warner code in attpos := !attpos + String.length s; s }} | data:CharData {{ if data.[0] = '<' then (* '<' is always a separate token *) raise (WF_error ("Attribute value contains character '<' literally")) else ( attpos := !attpos + String.length data; data ) }} | LLcurly {{ let s = process_attribute_event LLcurly !attpos in attpos := !attpos + String.length s; s }} | Lcurly {{ let s = process_attribute_event Lcurly !attpos in attpos := !attpos + String.length s; s }} | RRcurly {{ let s = process_attribute_event RRcurly !attpos in attpos := !attpos + String.length s; s }} | Rcurly {{ let s = process_attribute_event Rcurly !attpos in attpos := !attpos + String.length s; s }} | DQuote {{ incr attpos; "\"" }} attval_dquote_token(attpos): name:ERef_att {{ let etext = "&" ^ name ^ ";" in let s = Pxp_aux.expand_attvalue reused_lexobj dtd etext false in attpos := !attpos + String.length s; s }} | code:CRef {{ let s = Pxp_aux.character ?swarner:config.swarner config.encoding config.warner code in attpos := !attpos + String.length s; s }} | data:CharData {{ if data.[0] = '<' then (* '<' is always a separate token *) raise (WF_error ("Attribute value contains character '<' literally")) else ( attpos := !attpos + String.length data; data ) }} | LLcurly {{ let s = process_attribute_event LLcurly !attpos in attpos := !attpos + String.length s; s }} | Lcurly {{ let s = process_attribute_event Lcurly !attpos in attpos := !attpos + String.length s; s }} | RRcurly {{ let s = process_attribute_event RRcurly !attpos in attpos := !attpos + String.length s; s }} | Rcurly {{ let s = process_attribute_event Rcurly !attpos in attpos := !attpos + String.length s; s }} | SQuote {{ incr attpos; "'" }} start_tag_rangle(): Rangle {{ false }} | Rangle_empty {{ true }} end_tag(): /* parses </element>. * Pops the top element from the elstack and checks if it is the same * element. */ tag:Tag_end Ignore* Rangle {{ let name, tag_end_entid = tag in if n_tags_open = 0 then raise(WF_error("End-tag without start-tag")); self # event_end_tag name tag_end_entid; n_tags_open <- n_tags_open - 1; assert (n_tags_open >= 0); }} char_data(): /* Parses any literal characters not otherwise matching, and adds the * characters to the top element of elstack. * If elstack is empty, it is assumed that there is no surrounding * element, and any non-white space character is forbidden. */ data:CharData {{ if n_tags_open = 0 && not permit_any_content then (* only white space is allowed *) self # only_whitespace data else self # event_char_data data }} | Lcurly {{ process_curly_brace Lcurly }} | LLcurly {{ process_curly_brace LLcurly }} | Rcurly {{ process_curly_brace Rcurly }} | RRcurly {{ process_curly_brace RRcurly }} | data:Cdata {{ if n_tags_open = 0 && not permit_any_content then raise (WF_error("CDATA section not allowed here")); self # event_char_data data }} cref(): /* Parses &#...; and adds the character to the top element of elstack. 
*/ code:CRef {{ if n_tags_open = 0 && not permit_any_content then (* No surrounding element: character references are not allowed *) raise(WF_error("Character reference not allowed here")); self # event_char_data (character ?swarner:config.swarner config.encoding config.warner code) }} pi(): /* Parses <?...?> (but not <?xml white-space ... ?>). * If there is a top element in elstack, the processing instruction is added * to this element. */ pi: PI {{ let position = if config.store_element_positions then Some(context.manager # position) else None in let target0,value,ent_id = pi in let target = if config.enable_name_pool_for_pinstr_targets then make_pool_string target0 else target0 in self # event_pinstr position target value ent_id }} comment(): /* Parses <!-- ... --> */ begin_entid:Comment_begin $ {{ let position = if config.enable_comment_nodes && config.store_element_positions then Some(context.manager # position) else None in }} mat: Comment_material* end_entid: Comment_end {{ if begin_entid != end_entid then raise(Validation_error("The first and the last token of comments must be in the same entity")); self # event_comment position mat }} ? {{ match !yy_position with | "end_entid" -> raise(WF_error("`-->' expected")) | _ -> raise(WF_error("Bad comment")) }} %% (* The method "parse" continues here... *) try begin match start_symbol with `Entry_document flags -> parse_entry_document context.current context.get_next | `Entry_declarations flags -> parse_entry_declarations context.current context.get_next | `Entry_element_content flags -> parse_entry_element_content context.current context.get_next | `Entry_content flags -> parse_entry_content context.current context.get_next | `Entry_expr flags -> parse_entry_expr context.current context.get_next | `Entry_continuation _ -> parse_content_pull context.current context.get_next end; raise End_of_parsing with Not_found -> raise Parsing.Parse_error | End_of_parsing -> if n_entities_open > 0 then raise(WF_error("Illegal token")); () (* The exception Interrupt_parsing is not caught here, but by the * caller *) (*********** The method "parse" ends here *************) (**********************************************************************) (* Here ends the class definition: *) end ;;