(* $Id: pxp_yacc.m2y,v 1.36 2002/03/10 23:40:52 gerd Exp $ -*- tuareg -*-
* ----------------------------------------------------------------------
* PXP: The polymorphic XML parser for Objective Caml.
* Copyright by Gerd Stolpmann. See LICENSE for details.
*)
open Parsing
open Pxp_types
open Pxp_lexer_types
open Pxp_dtd
open Pxp_entity
open Pxp_document
open Pxp_aux
open Pxp_reader
(* Some types from the interface definition: *)
exception ID_not_unique
class type [ 'ext ] index =
object
constraint 'ext = 'ext node #extension
method add : string -> 'ext node -> unit
method find : string -> 'ext node
end
type config =
{ warner : collect_warnings;
enable_pinstr_nodes : bool;
enable_super_root_node : bool;
enable_comment_nodes : bool;
drop_ignorable_whitespace : bool;
encoding : rep_encoding;
recognize_standalone_declaration : bool;
store_element_positions : bool;
idref_pass : bool;
validate_by_dfa : bool;
accept_only_deterministic_models : bool;
disable_content_validation : bool;
name_pool : pool;
enable_name_pool_for_element_types : bool;
enable_name_pool_for_attribute_names : bool;
enable_name_pool_for_attribute_values : bool;
(* enable_name_pool_for_notation_names : bool; *)
enable_name_pool_for_pinstr_targets : bool;
enable_namespace_processing : namespace_manager option;
enable_namespace_info : bool;
debugging_mode : bool;
}
type source = Pxp_dtd.source =
Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver)
| ExtID of (ext_id * Pxp_reader.resolver)
type start_symbol =
Ext_document
| Ext_declarations
| Ext_element
type context =
{ mutable current : unit -> token; (* get the current token *)
mutable get_next : unit -> token; (* go on to the next token; return it *)
mutable current_token : token; (* This is the current token *)
mutable manager : entity_manager; (* The entity manager *)
}
let make_context entity_manager =
let c =
{ current = (fun _ -> assert false);
get_next = (fun _ -> assert false);
current_token = Eof;
manager = entity_manager;
}
in
(* Note that the function which is stored in get_next_ref can be changed
* as a side-effect when an entity is opened or closed. The function in
* c.get_next must be programmed such that always the current "get_next"
* function is executed.
*)
let get_next_ref = entity_manager # yy_get_next_ref in
c.current <- (fun () -> c.current_token);
c.get_next <- (fun () -> let tok = !get_next_ref() in
c.current_token <- tok;
tok);
ignore(c.get_next());
c
;;
let from_channel ?(alt = []) ?system_encoding ?id:init_id ?fixenc ch =
let channel_id = allocate_private_id() in
match init_id with
None ->
(* The simple case: already implemented by resolve_read_this_channel *)
let r = new resolve_read_this_channel
~id: (Private channel_id)
?fixenc
ch
in
ExtID(Private channel_id, new combine (r :: alt))
| Some (System sysname) ->
(* Here we use a specially configured resolve_as_file. The private
* ID channel_id is mapped to the URL sysid and to the channel
* ch
*)
let url_syntax =
{ Neturl.null_url_syntax with
Neturl.url_enable_scheme = Neturl.Url_part_allowed;
Neturl.url_enable_host = Neturl.Url_part_allowed;
Neturl.url_enable_path = Neturl.Url_part_required;
Neturl.url_accepts_8bits = true;
}
in
let url =
try
Neturl.url_of_string url_syntax sysname
(* may raise Malformed_URL *)
with
Neturl.Malformed_URL ->
failwith "Pxp_yacc.from_channel: Bad ~id option (malformed URL)"
in
let r =
new resolve_as_file
?system_encoding
~map_private_id: (fun pid ->
if pid = channel_id then
url
else raise Not_competent)
~open_private_id: (fun pid ->
if pid = channel_id then
ch, fixenc
else
raise Not_competent
)
()
in
ExtID(Private channel_id, new combine (r :: alt))
| Some _ ->
failwith "Pxp_yacc.from_channel: Bad ~id option (must be System)"
;;
let from_file ?(alt = []) ?system_encoding ?enc utf8_filename =
let r =
new resolve_as_file
?system_encoding:system_encoding
()
in
let url = make_file_url
?system_encoding
?enc
utf8_filename in
let xid = System (Neturl.string_of_url url) in
ExtID(xid, new combine (r :: alt))
;;
let from_string ?fixenc s =
let r =
new resolve_read_this_string ?fixenc:fixenc s in
ExtID(Anonymous, r)
;;
(* An array based stack implementation. Advantage: Constant memory
* consumption.
*)
type 't array_stack =
{ mutable stack_array : 't array;
mutable stack_top : int;
mutable stack_null : 't;
}
;;
let stack_create null =
(* null: a dummy element *)
{ stack_array = Array.create 100 null;
stack_top = -1;
stack_null = null;
}
;;
let stack_push x st =
let top = st.stack_top + 1 in
if top >= Array.length st.stack_array then
st.stack_array <- Array.append
st.stack_array (Array.create 100 st.stack_null);
st.stack_array.(top) <- x;
st.stack_top <- top
;;
let stack_top st =
let top = st.stack_top in
if top >= 0 then
st.stack_array.(top)
else
raise Stack.Empty
;;
let stack_pop st =
let top = st.stack_top in
if top >= 0 then begin
let x = st.stack_array.(top) in
st.stack_array.(top) <- st.stack_null;
st.stack_top <- top-1;
x
end
else
raise Stack.Empty
;;
(**********************************************************************)
class ['ext] parser_object
(init_doc:'ext document) init_dtd init_extend_dtd init_config init_resolver init_spec
init_process_xmldecl transform_dtd id_index
=
object (self)
(* Note that the 'ext parameter has been the motivation to make the
* parser a class.
*)
val mutable dtd = init_dtd
(* The DTD being parsed; or the DTD currently assumed *)
val extend_dtd = init_extend_dtd
(* Whether the DTD should be extended by ELEMENT, ATTLIST, and
* NOTATION declarations or not. (True for validating mode,
* false for well-formedness mode.)
*)
val transform_dtd = transform_dtd
(* A function transforming the DTD *)
val id_index = (id_index : 'ext index option)
(* The ID index or None *)
val process_xmldecl = init_process_xmldecl
(* Whether the XML declaration is parsed and the found XML version
* and standalone declaration are passed to 'doc'.
*)
val lexerset = Pxp_lexers.get_lexer_set (init_config.encoding)
val doc = init_doc
(* The current document *)
method doc = (doc : 'ext document)
val resolver = init_resolver
(* The resolver for external IDs *)
val config = init_config
(* The current configuration *)
val elstack =
let null_node = get_data_exemplar init_spec in
let null_id = (null_node :> entity_id) in
let null = (null_node, "", null_id) in
(stack_create null : ('ext node * string * entity_id) array_stack)
(* The element stack containing all open elements, i.e. elements that
* have begun by a start tag but that have not been finished (end tag).
* If the parser sees a start tag, it creates the element and pushes it
* on top of this stack. If the parser recognizes an end tag, it pulls
* one element from the stack and checks if it has the same name as
* given with the end tag.
*
* At initialization time, a special element is pushed on the stack,
* the so-called super root. It is always the bottommost
* element of the stack, and serves as a guard.
* [See "initializer" below.]
*)
method current =
(* Get the top element of the element stack *)
try
let (x,_,_) = stack_top elstack in x
with
Stack.Empty -> assert false
(* Not possible, because the super root is always the element
* at the bottom of the stack.
*)
val mutable n_tags_open = 0
(* Number of begin tags that have been parsed and whose corresponding
* end tags have not yet been parsed
*)
val mutable p_internal_subset = false
(* true while parsing the internal subset - there are some additional
* constraints for internal subsets, and because of this it must
* be known whether the current declaration is contained in the
* internal or external subset of the DTD.
*)
val mutable root = None
(* Contains the root element (topmost element) while it is being parsed
* and after it has been parsed.
* This variable is None before the root element is seen.
*)
method root = root
val spec = init_spec
(* A hashtable that contains exemplar objects for the various element
* types. If an element is parsed, the exemplar is looked up and
* "cloned" (by the "create" method)
*)
val mutable current_data = []
val mutable current_string = ""
(* Collect character data. *)
method collect_data s =
(* Collects the character material 's' *)
if String.length current_string = 0 then
current_string <- s
else
current_data <- s :: current_data
method save_data =
(* Puts the material collected in 'current_data' into a new
* node, and appends this node as new sub node to 'current'
*)
let add_node d =
let cur = self # current in
match cur # classify_data_node d with
CD_normal
| CD_other ->
cur # append_node d
| CD_empty ->
()
| CD_ignorable ->
if not config.drop_ignorable_whitespace then
cur # append_node d
| CD_error e ->
raise e
in
match current_data with
[] ->
if String.length current_string > 0 then
add_node (create_data_node spec dtd current_string);
current_string <- "";
| [ str ] ->
(* assertion: current_string <> "" *)
let s = if str = "" then current_string else current_string ^ str in
add_node (create_data_node spec dtd s);
current_string <- "";
current_data <- []
| _ ->
let accu = ref (String.length current_string) in
List.iter (fun s -> accu := !accu + String.length s) current_data;
let str = String.create !accu in
let pos = ref (!accu) in
List.iter
(fun s ->
let l = String.length s in
pos := !pos - l;
String.blit s 0 str !pos l
)
current_data;
String.blit current_string 0 str 0 (String.length current_string);
add_node (create_data_node spec dtd str);
current_string <- "";
current_data <- []
method only_whitespace data =
(* Checks that the string "data" contains only whitespace. On failure,
* Validation_error is raised.
*)
if not (Pxp_lib.only_whitespace data) then
raise(WF_error("Data not allowed here"));
()
val mutable src_norm_mapping = [ "xml", "xml" ]
(* Namespace processing: Contains pairs (srcprefix, normprefix).
* srcprefix = "!" is used as guard.
*)
val mutable default_normprefix = ""
(* Namespace_processing: The default normprefix, or "" if none *)
method pop_src_norm_mapping () =
let rec pop m =
match m with
[] ->
assert false
| ("!",d)::m' ->
default_normprefix <- d;
src_norm_mapping <- m'
| (_,_)::m' ->
pop m'
in
pop src_norm_mapping
method normalize_namespace_prefix ?(apply_default = false) prefix localname =
if String.contains localname ':' then
raise(Namespace_error("Found several colons in a name"));
if prefix = "" then begin
(* No prefix *)
if apply_default && default_normprefix <> "" then
default_normprefix ^ ":" ^ localname
else
localname
end
else begin
(* Prefix exists *)
let normprefix =
try List.assoc prefix src_norm_mapping
with
Not_found ->
raise(Namespace_error ("Namespace prefix not declared: " ^ prefix))
in
normprefix ^ ":" ^ localname
end
val mutable init_done = false (* element stack initialized? *)
val mutable early_material = [] (* saved material before init_done *)
(* Call the following methods for comments and processing instructions
* that occur before the element stack is initialized
*)
method private add_early_comment position c =
assert(not init_done);
early_material <- early_material @ [ position, `Comment c ]
method private add_early_pinstr position pi =
assert(not init_done);
early_material <- early_material @ [ position, `PI pi ]
method private add_early_pinstr_node position pi =
assert(not init_done);
early_material <- early_material @ [ position, `PI_node pi ]
method private init_for_xml_body() =
if not init_done then begin
dtd <- transform_dtd dtd;
(* Initialize the element stack: *)
let super_root =
if config.enable_super_root_node then begin
let sr = create_super_root_node spec dtd in
(* Add early_material to the super root node: *)
List.iter
(function
(p, `Comment c) ->
let node = create_comment_node ?position:p spec dtd c in
sr # append_node node
| (p, `PI pi) ->
sr # add_pinstr pi
| (p, `PI_node pi) ->
let node = create_pinstr_node ?position:p spec dtd pi in
sr # append_node node
)
early_material;
sr
end
else
(* because spec may not contain an exemplar for the super root: *)
create_no_node spec dtd
in
early_material <- [];
(* Move the super root or the emulation to the stack: *)
stack_push (super_root, "", (self :> entity_id)) elstack;
init_done <- true;
end
initializer
(* CHECKS: *)
if config.encoding <> dtd # encoding then
failwith("Encoding mismatch");
(* --- Initialize 'elstack': Push the super-root on the stack. *)
(* (This is now done later, in the contents_start rule) *)
(********* Here the method "parse" begins. The grammar below is
* transformed to a local function of this method
*)
method parse context start_symbol =
let parse_ignored_section yy_current yy_get_next =
(* A special parser which should be used after <![IGNORE[.
* It parses until the corresponding ]]> is found.
*)
while yy_current() = Ignore do
ignore(yy_get_next());
done;
( match yy_current() with
Conditional_body _ -> ()
| _ -> raise Parsing.Parse_error;
);
let en = context.manager # current_entity in
let llev = ref 1 in
while !llev >= 1 do
let igntok = en # next_ignored_token in
(* next_ignored_token: uses a special lexer that only
* recognizes Conditional_begin and Conditional_end;
* other character combinations are ignored.
*)
(* NOTE: next_ignored_token works much like yy_get_next,
* but it does not set the current token!
*)
match igntok with
Conditional_begin _ ->
llev := !llev + 1
| Conditional_end _ ->
llev := !llev - 1;
(* Because the loop may be exited now: *)
context.current_token <- igntok;
| (End_entity | Eof) ->
raise Parsing.Parse_error
| _ ->
()
done;
in
let check_and_parse_xmldecl xmldecl =
if process_xmldecl then begin
let v, _, s = decode_doc_xml_pi (decode_xml_pi xmldecl) in
check_version_num v;
doc # init_xml_version v;
let v = match s with
None -> false
| Some "yes" -> true
| Some "no" -> false
| _ -> raise (WF_error("Illegal 'standalone' declaration"))
in
if config.recognize_standalone_declaration then
dtd # set_standalone_declaration v
end
in
let recode_utf8 s =
(* Recode 's' to UTF-8 *)
if config.encoding = `Enc_utf8 then
s (* No recoding necessary *)
else
Netconversion.recode_string
~in_enc:(config.encoding :> encoding) ~out_enc:`Enc_utf8 s
in
let make_pool_string = pool_string config.name_pool in
(*
let rec start_tag_check_attlist al =
match al with
(nv1, num1) :: al' ->
if not num1 && al' <> [] then begin
match al with
((n1,_),_) :: ((n2,_),_) :: _ ->
raise(WF_error("Whitespace is missing between attributes `" ^
n1 ^ "' and `" ^ n2 ^ "'"))
| _ -> assert false
end;
start_tag_check_attlist al'
| [] -> ()
in
*)
let reused_lexbuf = Pxp_lexing.from_string "" in
%%
/* The following grammar looks similar to ocamlyacc grammars, but
* ocamlyacc is actually not used to transform the grammar into a parser.
* Instead, the parser generator m2parsergen is applied.
*
* The format of the grammar is different (see m2parsergen/README),
* but I hope that you can understand most features immediately.
*
* The type of the parser is different: m2parsergen creates a top-down
* parser while ocamlyacc generates a LALR-1 parser.
*
* The way the generated code is called is different: ocamlyacc produces
* lots of top-level definitions whereas m2parsergen generates only
* a local let-in-phrase. This is explained in the already mentioned
* README file.
*/
/* See Pxp_types.ml for comments to the various tokens */
%token Begin_entity
%token End_entity
%token <> Comment_begin
%token <> Comment_end
%token Ignore
%token Eq
%token Rangle
%token Rangle_empty
%token <> Conditional_begin
%token <> Conditional_body
%token <> Conditional_end
%token Percent
%token Plus
%token Star
%token Bar
%token Comma
%token Qmark
%token Pcdata
%token Required
%token Implied
%token Fixed
%token Eof
%token <> Comment_material
%token <> Doctype
%token <> Doctype_rangle
%token <> Dtd_begin
%token <> Dtd_end
%token <> Decl_element
%token <> Decl_attlist
%token <> Decl_entity
%token <> Decl_notation
%token <> Decl_rangle
%token <> Lparen
%token <> Rparen
%token <> RparenPlus
%token <> RparenStar
%token <> RparenQmark
%token <> Tag_beg
%token <> Tag_end
%token <> PI
%token <> PI_xml
%token <> Cdata
%token <> CRef
%token <> ERef
%token <> PERef
%token <> CharData
%token <> LineEnd
%token <> Name
%token <> Nametoken
%token <> Attval
%token <> Attval_nl_normalized
%token <> Unparsed_string
/* START SYMBOLS:
*
* "ext_document": parses a complete XML document (i.e. containing a
* <!DOCTYPE..> and an element)
* "ext_declarations": parses an "external DTD subset", i.e. a sequence
* of declarations
* "ext_element": parses a single element (no <!DOCTYPE...> allowed);
* the element needs not to be the root element of the
* DTD
*
* The functions corresponding to these symbols return always () because
* they only have side-effects.
*/
/* SOME GENERAL COMMENTS:
*
* The parser does not get its tokens from the lexers directly. Instead of
* this, there is an entity object between the parser and the lexers. This
* object already handles:
*
* - References to general and parameter entities. The token stream is
* modified such that tokens automatically come from the referenced entities.
* External parameter entities and all general entities are embraced by
* the two special tokens Begin_entity and End_entity. The parser must
* check that these braces are correctly nested.
*/
%%
ext_document():
Begin_entity
doc_xmldecl_then_misc_then_prolog_then_rest() End_entity
{{
if n_tags_open <> 0 then
raise(WF_error("Missing end tag"))
}}
/* In the following rule, we must find out whether there is an XML declaration
* or not, and directly after that either "process_xmldecl" or
* "process_missing_xmldecl" of the current entity must be called.
* AND IT MUST BE DIRECTLY! Because of this, the invocation is carried out
* in the "$" clause immediately following the first token.
*
* TODO: This is not enough. The first token may be a tag, and the tag
* may already contain non-ASCII characters. (But in this case, the resolvers
* assume UTF8, and they are right...)
*/
doc_xmldecl_then_misc_then_prolog_then_rest():
pl:PI_xml
$ {{ context.manager # current_entity # process_xmldecl pl;
check_and_parse_xmldecl pl;
}}
misc()* doc_prolog_then_rest()
{{ () }}
| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
misc() misc()* doc_prolog_then_rest()
{{ () }}
| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
doctypedecl() misc()* contents_start()
{{ () }}
| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
contents_start()
{{ () }}
doc_prolog_then_rest():
doctypedecl() misc()* contents_start()
{{ () }}
| contents_start()
{{ () }}
ext_element():
Begin_entity el_xmldecl_then_misc_then_rest() End_entity
{{
if n_tags_open <> 0 then
raise(WF_error("Missing end tag"))
}}
/* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */
el_xmldecl_then_misc_then_rest():
pl:PI_xml
$ {{ context.manager # current_entity # process_xmldecl pl; }}
misc()* contents_start()
{{ () }}
| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
misc() misc()* contents_start()
{{ () }}
| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
contents_start()
{{ () }}
ext_declarations():
/* Parses a sequence of declarations given by an entity. As side-effect,
* the parsed declarations are put into the dtd object.
*/
Begin_entity decl_xmldecl_then_rest()
{{ () }}
| Eof
{{ () }}
decl_xmldecl_then_rest():
/* Note: This rule is also called from declaration()! */
pl:PI_xml
$ {{ context.manager # current_entity # process_xmldecl pl;
}}
declaration()* End_entity
{{ () }}
| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
declaration() declaration()* End_entity
{{ () }}
| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
End_entity
{{ () }}
misc():
pi()
{{ () }}
| data: CharData
/* In this context, the lexers sometimes do not recognize white space;
* instead CharData tokens containing white space are delivered.
*/
{{ self # only_whitespace data }}
| Ignore
{{ () }}
| comment()
{{ () }}
/********************* DOCUMENT TYPE DECLARATION *************************/
doctypedecl():
/* parses from <!DOCTYPE to >. As side-effect, first the declarations of
* the internal DTD (if any) are put into !!on_dtd, then the declarations
* of the external DTD (if any) are put into this DTD object.
*/
doctype_entid: Doctype
ws: Ignore Ignore*
doctypedecl_material (doctype_entid)
{{ () }}
? {{ match !yy_position with
"ws" -> raise(WF_error("Whitespace is missing after `DOCTYPE'"))
| _ -> raise(WF_error("Bad DOCTYPE declaration"))
}}
/* TRICK:
* ws: Ignore? Ignore*
* is meant seriously. The effect is that ws becomes a boolean variable
* which is true if there is an Ignore token and false otherwise.
* This construct is faster than just
* ws: Ignore*
* in which case ws becomes an integer variable containing the number of
* Ignore tokens. Counting the number of tokens is slower than only checking
* the existence.
*
* We need the information whether there is an Ignore token (representing
* white space), because white space is only obligatory if also an identifier
* for the external subset is parsed; this conditional syntax constraint is
* simply programmed in the body of the grammar rule.
*/
doctypedecl_material(doctype_entid):
root_name: Name
ws: Ignore? Ignore*
external_subset: external_id()?
Ignore*
internal_subset: internal_dtd()?
Ignore*
doctype_rangle_entid: Doctype_rangle
{{
if doctype_entid != doctype_rangle_entid then
raise (Validation_error("Entities not properly nested with DOCTYPE declaration"));
dtd # set_root root_name;
begin match external_subset, internal_subset with
None, None -> () (* no DTD means no ID *)
| None, Some _ -> dtd # set_id Internal
| Some id, None -> dtd # set_id (External id)
| Some id, Some _ -> dtd # set_id (Derived id)
end;
(* Get now the external doctype declaration. Note that the internal
* subset has precedence and must be read first.
*)
begin match external_subset with
None -> ()
| Some id ->
if not ws then
raise(WF_error("Whitespace is missing after `DOCTYPE " ^
root_name ^ "'"));
let r' = resolver # clone in
let pobj =
new parser_object
(new document config.warner config.encoding)
dtd
extend_dtd
config
r'
spec
process_xmldecl
(fun x -> x)
None
in
let en = new external_entity r' dtd "[dtd]"
config.warner id false
config.encoding
in
en # set_debugging_mode (config.debugging_mode);
let mgr = new entity_manager en in
en # open_entity true Declaration;
try
let context = make_context mgr in
pobj # parse context Ext_declarations;
ignore(en # close_entity);
with
error ->
ignore(en # close_entity);
r' # close_all;
let pos = mgr # position_string in
raise (At(pos, error))
end;
dtd # validate
}}
? {{
match !yy_position with
"doctype_rangle_entid" -> raise(WF_error("`>' expected"))
| _ -> raise(WF_error("Bad DOCTYPE declaration"))
}}
/* Note that there are no keywords for SYSTEM or PUBLIC, as these would
* be difficult to recognize in the lexical contexts. Because of this,
* SYSTEM/PUBLIC is parsed as name, and the rule for everything after
* SYSTEM/PUBLIC is computed dynamically.
*/
external_id():
tok:Name
$ {{
let followup =
match tok with
"SYSTEM" -> parse_system_id
(* Apply the rule system_id (below) to parse the
* rest of the ID
*)
| "PUBLIC" -> parse_public_id
(* Apply the rule public_id (below) to parse the
* rest of the ID
*)
| _ -> raise(WF_error("SYSTEM or PUBLIC expected"))
in
}}
ws:Ignore Ignore*
r:[followup]()
{{ r }}
? {{ match !yy_position with
"ws" -> raise(WF_error("Whitespace is missing after " ^ tok))
| _ -> raise(WF_error("Bad SYSTEM or PUBLIC identifier"))
}}
system_id():
str:Unparsed_string
{{ System (recode_utf8 str) }}
public_id():
str1: Unparsed_string
ws: Ignore Ignore*
str2: Unparsed_string
{{ check_public_id str1;
Public(recode_utf8 str1, recode_utf8 str2)
}}
? {{ match !yy_position with
"ws" -> raise(WF_error("Whitespace is missing between the literals of the PUBLIC identifier"))
| _ -> raise(WF_error("Bad PUBLIC identifier"))
}}
/* The internal subset: "[" declaration* "]". While parsing the declarations
* the object variable p_internal_subset must be true; however, if there
* are entity references, this variable must be reset to false during
* the entity. (See the rule for "declaration" below.)
*/
internal_dtd():
dtd_begin_entid: internal_dtd_begin()
declaration()*
dtd_end_entid: internal_dtd_end()
{{
if dtd_begin_entid != dtd_end_entid then
raise(Validation_error("Entities not properly nested with internal DTD subset"))
}}
? {{ match !yy_position with
"dtd_end_entid" -> raise(WF_error("`]' expected"))
| _ -> raise(WF_error("Bad internal DTD subset"))
}}
internal_dtd_begin():
Dtd_begin
{{ assert (not p_internal_subset);
p_internal_subset <- true }}
internal_dtd_end():
Dtd_end
{{ assert p_internal_subset;
p_internal_subset <- false }}
declaration():
/* Parses a single declaration (or processing instruction). As side-effect
* the parsed declaration is stored into the dtd object.
*/
elementdecl()
{{ () }}
| attlistdecl()
{{ () }}
| entid:Decl_entity ws:Ignore Ignore* e:entitydecl(entid)
{{ () }}
? {{ match !yy_position with
"ws" -> raise(WF_error("Whitespace is missing after ENTITY"))
| "e" -> raise(WF_error("Name or `%' expected"))
| _ -> raise(WF_error("Bad entity declaration"))
}}
| notationdecl()
{{ () }}
| pi: PI
{{ let target, value = pi in
let pi = new proc_instruction target value config.encoding in
dtd # add_pinstr pi
}}
| Ignore
{{ () }}
| begin_entid:Comment_begin Comment_material* end_entid:Comment_end
{{
if begin_entid != end_entid then
raise(Validation_error("The first and the last token of comments must be in the same entity"));
}}
? {{ match !yy_position with
"end_entid" -> raise(WF_error("`-->' expected"))
| _ -> raise(WF_error("Bad comment"))
}}
| Begin_entity
$ {{ (* Set 'p_internal_subset' to 'false' until the matching 'end_entity'
* rule is parsed. This allows unrestricted usage of parameter entities
* within declarations of internal entities.
*)
let old_p_internal_subset = p_internal_subset in
p_internal_subset <- false;
}}
decl_xmldecl_then_rest()
{{ (* Restore the old value of 'p_internal_subset'. *)
p_internal_subset <- old_p_internal_subset;
()
}}
| begin_entid:Conditional_begin
$ {{ (* Check whether conditional sections are allowed at this position. *)
if p_internal_subset then
raise(WF_error("Restriction of the internal subset: Conditional sections not allowed"));
}}
Ignore*
cond:conditional_section() end_entid:Conditional_end
{{ (* Check whether Conditional_begin and Conditional_end are in the same
* entity. (This restriction is explained in the file SPECS.)
*)
if begin_entid != end_entid then
raise(Validation_error("The first and the last token of conditional sections must be in the same entity (additional restriction of this parser)"));
}}
? {{ match !yy_position with
"end_entid" -> raise(WF_error("`>]>' expected"))
| "cond" -> raise(WF_error("INCLUDE or IGNORE expected"))
| _ -> raise(WF_error("Bad conditional section"))
}}
/* The tokens INCLUDE/IGNORE are scanned as names, and the selection of the
* right parsing rule is dynamic.
* Note that parse_ignored_section is not defined by a grammar rule but
* by a conventional let-binding above.
*/
conditional_section():
include_or_ignore:Name
$ {{ let parsing_function =
match include_or_ignore with
"INCLUDE" -> parse_included_section
(* invoke rule "included_section" below *)
| "IGNORE" -> parse_ignored_section
(* invoke function "parse_ignored_section" *)
| _ -> raise(WF_error("INCLUDE or IGNORE expected"))
in
}}
[ parsing_function ] ()
{{ () }}
? {{ raise(WF_error("Bad conditional section")) }}
included_section():
Conditional_body declaration()*
{{ () }}
| Ignore Ignore* Conditional_body declaration()*
{{ () }}
/*************************** ELEMENT DECLARATIONS ********************/
elementdecl():
/* parses <!ELEMENT ... >. Puts the parsed element type as side-effect into
* dtd.
*/
decl_element_entid: Decl_element
$ {{ let extdecl = context.manager # current_entity_counts_as_external in
}}
ws1: Ignore Ignore*
name: Name
ws2: Ignore Ignore*
content_model: contentspec()
Ignore*
decl_rangle_entid: Decl_rangle
{{
if decl_element_entid != decl_rangle_entid then
raise (Validation_error "Entities not properly nested with ELEMENT declaration");
if extend_dtd then begin
let el = new dtd_element dtd name in
(* It is allowed that an <!ATTLIST...> precedes the corresponding
* <!ELEMENT...>. Because of this it is possible that there is already
* an element called 'name' in the DTD, and we only must set the content
* model of this element.
*)
try
dtd # add_element el;
el # set_cm_and_extdecl content_model extdecl;
with
Not_found -> (* means: there is already an element 'name' *)
let el' = dtd # element name in
el' # set_cm_and_extdecl content_model extdecl;
(* raises Validation_error if el' already has a content model *)
end
}}
? {{ match !yy_position with
("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
| "name" -> raise(WF_error("The name of the element is expected here"))
| "content_model" -> raise(WF_error("Content model expression expected"))
| "decl_rangle_entid" -> raise(WF_error("`>' expected"))
| _ -> raise(WF_error("Bad element type declaration"))
}}
contentspec():
/* parses a content model and returns it (type content_model_type) */
name: Name /* EMPTY or ANY */
{{ match name with
"EMPTY" -> Empty
| "ANY" -> Any
| _ -> raise(WF_error("EMPTY, ANY, or a subexpression expected"))
}}
| entid:Lparen Ignore* term:mixed_or_regexp(entid)
{{ term }}
? {{ raise(WF_error("Bad content model expression")) }}
/* Many of the following rules have an lparen_entid argument. This is the
* internal ID of the entity containing the corresponding left parenthesis;
* by comparing it with the ID of the entity of the right parenthesis the
* contraint is implemented that both parentheses must be in the same entity.
*/
mixed_or_regexp(lparen_entid):
re: choice_or_seq(lparen_entid)
{{ Regexp re }}
| m: mixed(lparen_entid)
{{ m }}
multiplier():
/* returns one of the multiplier symbols (?,*,+) */
Plus
{{ Plus }}
| Star
{{ Star }}
| Qmark
{{ Qmark }}
mixed (lparen_entid) :
Pcdata
Ignore*
material: mixed_alternatives_top()
{{
let rest, rparen_entid = material in
if lparen_entid != rparen_entid then
raise (Validation_error "Entities not properly nested with parentheses");
Mixed (MPCDATA :: rest)
}}
? {{ raise(WF_error("Bad content model expression")) }}
mixed_alternatives_top():
entid: Rparen
{{ [], entid }}
| entid: RparenStar
{{ [], entid }}
| Bar Ignore* name:Name Ignore* names:mixed_alternative()* entid:RparenStar
{{
(MChild name :: names), entid
}}
? {{ match !yy_position with
"name" -> raise(WF_error("Name expected"))
| "entid" -> raise(WF_error("`)*' expected"))
| _ -> raise(WF_error("Bad content model expression"))
}}
mixed_alternative() :
Bar Ignore* name:Name Ignore*
{{ MChild name }}
? {{ match !yy_position with
"name" -> raise(WF_error("Name expected"))
| _ -> raise(WF_error("Bad content model expression"))
}}
choice_or_seq (lparen_entid):
/* parses either a regular expression, or a mixed expression. Returns
* Mixed spec or Regexp spec (content_model_type).
* Which kind of expression (regexp or mixed) is being read is recognized
* after the first subexpression has been parsed; the other subexpressions
* must be of the same kind.
*/
re: cp()
Ignore*
factor: choice_or_seq_factor()
{{
let (finalmark,subexpr), rparen_entid = factor in
if lparen_entid != rparen_entid then
raise (Validation_error "Entities not properly nested with parentheses");
(* Check that the other subexpressions are "regexp", too, and
* merge them with the first.
*)
let re' =
match subexpr with
Alt [] -> re
| Alt alt -> Alt (re :: alt)
| Seq seq -> Seq (re :: seq)
| _ -> assert false
in
(* Interpret the finalmark. *)
match finalmark with
Ignore -> re'
| Plus -> Repeated1 re'
| Star -> Repeated re'
| Qmark -> Optional re'
| _ -> assert false
}}
? {{ raise(WF_error("Bad content model expression")) }}
choice_or_seq_factor():
/* Parses "|<subexpr>|...)" or ",<subexpr>,...)", both forms optionally
* followed by ?, *, or +.
* Returns ((finalmark, expr), rparen_entid), where
* - finalmark is the character after the right parenthesis or Ignore
* - expr is either
* Alt [] meaning that only ")" has been found
* Alt non_empty_list meaning that the subexpressions are separated by '|'
* Seq non_empty_list meaning that the subexpressions are separated by ','
*/
entid:Rparen
{{ (Ignore, Alt []), entid }}
| entid:RparenPlus
{{ (Plus, Alt []), entid }}
| entid:RparenStar
{{ (Star, Alt []), entid }}
| entid:RparenQmark
{{ (Qmark, Alt []), entid }}
| Bar Ignore* re:cp() Ignore* factor:choice_or_seq_factor()
{{
let (finalmark, subexpr), rparen_entid = factor in
begin match subexpr with
Alt [] -> (finalmark, (Alt [re])), rparen_entid
| Alt alt -> (finalmark, (Alt (re :: alt))), rparen_entid
| _ -> raise(WF_error("It is not allowed to mix alternatives and sequences"))
end
}}
? {{ raise(WF_error("Bad content model expression")) }}
| Comma Ignore* re:cp() Ignore* factor:choice_or_seq_factor()
{{
let (finalmark, subexpr), rparen_entid = factor in
begin match subexpr with
Alt [] -> (finalmark, (Seq [re])), rparen_entid
| Seq seq -> (finalmark, (Seq (re :: seq))), rparen_entid
| _ -> raise(WF_error("It is not allowed to mix alternatives and sequences"))
end
}}
? {{ raise(WF_error("Bad content model expression")) }}
cp():
/* parse either a name, or a parenthesized subexpression "(...)" */
name:Name m:multiplier()?
{{ match m with
None -> Child name
| Some Plus -> Repeated1 (Child name)
| Some Star -> Repeated (Child name)
| Some Qmark -> Optional (Child name)
| _ -> assert false
}}
? {{ raise(WF_error("Bad content model expression")) }}
| entid:Lparen Ignore* m:choice_or_seq(entid)
{{ m }}
? {{ raise(WF_error("Bad content model expression")) }}
/********************* ATTRIBUTE LIST DECLARATION ***********************/
attlistdecl():
/* parses <!ATTLIST ... >. Enters the attribute list in dtd as side-
* effect.
*/
decl_attlist_entid: Decl_attlist
$ {{ let extdecl = context.manager # current_entity_counts_as_external in
}}
ws1: Ignore Ignore*
el_name: Name
ws: Ignore? Ignore*
factor: attdef_factor()
{{
let at_list, decl_rangle_entid = factor in
if decl_attlist_entid != decl_rangle_entid then
raise (Validation_error "Entities not properly nested with ATTLIST declaration");
if not ws && at_list <> [] then begin
match at_list with
(name,_,_) :: _ ->
(* This is normally impossible, because the lexer demands
* some other token between two names.
*)
raise(WF_error("Whitespace is missing before `" ^ name ^ "'"));
| _ -> assert false
end;
if extend_dtd then begin
let new_el = new dtd_element dtd el_name in
(* Note that it is allowed that <!ATTLIST...> precedes the corresponding
* <!ELEMENT...> declaration. In this case we add the element declaration
* already to the DTD but leave the content model unspecified.
*)
let el =
try
dtd # add_element new_el;
new_el
with
Not_found -> (* already added *)
let old_el = dtd # element el_name in
if old_el # attribute_names <> [] then
config.warner # warn ("More than one ATTLIST declaration for element type `" ^
el_name ^ "'");
old_el
in
List.iter
(fun (a_name, a_type, a_default) ->
el # add_attribute a_name a_type a_default extdecl)
at_list
end
}}
? {{ match !yy_position with
"ws1" -> raise(WF_error("Whitespace is missing after ATTLIST"))
| "el_name" -> raise(WF_error("The name of the element is expected here"))
| "factor" -> raise(WF_error("Another attribute name or `>' expected"))
| _ -> raise(WF_error("Bad attribute declaration"))
}}
attdef_factor():
/* parses a list of triples <name> <type> <default value> and returns the
* list as (string * att_type * att_default) list.
*/
attdef:attdef() ws:Ignore? Ignore* factor:attdef_factor()
{{
let attdef_rest, decl_rangle_entid = factor in
if not ws && attdef_rest <> [] then begin
match attdef_rest with
(name,_,_) :: _ ->
raise(WF_error("Missing whitespace before `" ^ name ^ "'"));
| _ -> assert false
end;
(attdef :: attdef_rest), decl_rangle_entid }}
? {{ match !yy_position with
| "factor" -> raise(WF_error("Another attribute name or `>' expected"))
| _ -> raise(WF_error("Bad attribute declaration"))
}}
| entid:Decl_rangle
{{ [], entid }}
attdef():
/* Parses a single triple */
name: Name
ws1: Ignore Ignore*
tp: atttype()
ws2: Ignore Ignore*
default: defaultdecl()
{{ (name,tp,default) }}
? {{ match !yy_position with
("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
| "tp" -> raise(WF_error("Type of attribute or `(' expected"))
| "default" -> raise(WF_error("#REQUIRED, #IMPLIED, #FIXED or a string literal expected"))
| _ -> raise(WF_error("Bad attribute declaration"))
}}
atttype():
/* Parses an attribute type and returns it as att_type. */
name: Name
$ {{ let followup =
if name = "NOTATION" then
parse_notation
else
parse_never
in
}}
nota: [followup]()?
{{
match name with
"CDATA" -> A_cdata
| "ID" -> A_id
| "IDREF" -> A_idref
| "IDREFS" -> A_idrefs
| "ENTITY" -> A_entity
| "ENTITIES" -> A_entities
| "NMTOKEN" -> A_nmtoken
| "NMTOKENS" -> A_nmtokens
| "NOTATION" ->
(match nota with
None -> raise(WF_error("Error in NOTATION type (perhaps missing whitespace after NOTATION?)"))
| Some n -> n
)
| _ -> raise(WF_error("One of CDATA, ID, IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS, NOTATION, or a subexpression expected"))
}}
? {{ raise(WF_error("Bad attribute declaration (perhaps missing whitespace after NOTATION)")) }}
| Lparen
Ignore*
name: name_or_nametoken()
Ignore*
names: nmtoken_factor()*
rp: Rparen
/* Enumeration */
{{ A_enum(name :: names) }}
? {{ match !yy_position with
"name" -> raise(WF_error("Name expected"))
| "names" -> raise(WF_error("`|' and more names expected, or `)'"))
| "rp" -> raise(WF_error("`|' and more names expected, or `)'"))
| _ -> raise(WF_error("Bad enumeration type"))
}}
never():
/* The always failing rule */
$ {{ raise Not_found; }}
Doctype /* questionable */
{{ A_cdata (* Does not matter *)
}}
notation():
Ignore Ignore*
lp: Lparen
Ignore*
name: Name
Ignore*
names: notation_factor()*
rp: Rparen
{{ A_notation(name :: names) }}
? {{ match !yy_position with
"lp" -> raise(WF_error("`(' expected"))
| "name" -> raise(WF_error("Name expected"))
| "names" -> raise(WF_error("`|' and more names expected, or `)'"))
| "rp" -> raise(WF_error("`|' and more names expected, or `)'"))
| _ -> raise(WF_error("Bad NOTATION type"))
}}
notation_factor():
/* Parse "|<name>" and return the name */
Bar Ignore* name:Name Ignore*
{{ name }}
? {{ match !yy_position with
"name" -> raise(WF_error("Name expected"))
| _ -> raise(WF_error("Bad NOTATION type"))
}}
nmtoken_factor():
/* Parse "|<nmtoken>" and return the nmtoken */
Bar Ignore* n:name_or_nametoken() Ignore*
{{ n }}
? {{ match !yy_position with
"n" -> raise(WF_error("Nametoken expected"))
| _ -> raise(WF_error("Bad enumeration type"))
}}
name_or_nametoken():
n:Name {{ n }}
| n:Nametoken {{ n }}
/* The default values must be expanded and normalized. This has been implemented
* by the function expand_attvalue.
*/
defaultdecl():
/* Parse the default value for an attribute and return it as att_default */
Required
{{ D_required }}
| Implied
{{ D_implied }}
| Fixed ws:Ignore Ignore* str:Unparsed_string
{{ D_fixed
(expand_attvalue reused_lexbuf lexerset dtd str config.warner false) }}
? {{ match !yy_position with
"ws" -> raise(WF_error("Whitespace is missing after #FIXED"))
| "str" -> raise(WF_error("String literal expected"))
| _ -> raise(WF_error("Bad #FIXED default value"))
}}
| str:Unparsed_string
{{ D_default
(expand_attvalue reused_lexbuf lexerset dtd str config.warner false) }}
/**************************** ENTITY DECLARATION ***********************/
entitydecl(decl_entity_entid):
/* parses everything _after_ <!ENTITY until the matching >. The parsed
* entity declaration is entered into the dtd object as side-effect.
*/
name: Name
$ {{ let extdecl = context.manager # current_entity_counts_as_external in
}}
ws: Ignore Ignore*
material: entitydef()
Ignore*
decl_rangle_entid: Decl_rangle
/* A general entity */
{{
if decl_entity_entid != decl_rangle_entid then
raise (Validation_error "Entities not properly nested with ENTITY declaration");
let en =
(* Distinguish between
* - internal entities
* - external entities
* - NDATA (unparsed) entities
*)
match material with
(Some s, None, None) ->
new internal_entity dtd name config.warner s p_internal_subset
false config.encoding
| (None, Some xid, None) ->
new external_entity (resolver # clone) dtd name config.warner
xid false
config.encoding
| (None, Some xid, Some n) ->
(new ndata_entity name xid n config.encoding :> entity)
| _ -> assert false
in
dtd # add_gen_entity en extdecl
}}
? {{ match !yy_position with
"ws" -> raise(WF_error("Whitespace is missing"))
| "material" -> raise(WF_error("String literal or identifier expected"))
| "decl_rangle_entid" -> raise(WF_error("`>' expected"))
| _ -> raise(WF_error("Bad entity declaration"))
}}
| Percent
$ {{ let extdecl = context.manager # current_entity_counts_as_external in
}}
ws1: Ignore Ignore*
name: Name
ws2: Ignore Ignore*
material: pedef()
Ignore*
decl_rangle_entid: Decl_rangle
/* A parameter entity */
{{
if decl_entity_entid != decl_rangle_entid then
raise (Validation_error "Entities not properly nested with ENTITY declaration");
let en =
(* Distinguish between internal and external entities *)
match material with
(Some s, None) ->
new internal_entity dtd name config.warner s p_internal_subset
true config.encoding
| (None, Some xid) ->
new external_entity (resolver # clone) dtd name config.warner
xid true
config.encoding
| _ -> assert false
in
(* The following two lines force that even internal entities count
* as external (for the standalone check) if the declaration of
* the internal entity occurs in an external entity.
*)
if extdecl then
en # set_counts_as_external;
dtd # add_par_entity en;
}}
? {{ match !yy_position with
("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
| "material" -> raise(WF_error("String literal or identifier expected"))
| "decl_rangle_entid" -> raise(WF_error("`>' expected"))
| _ -> raise(WF_error("Bad entity declaration"))
}}
entitydef():
/* parses the definition value of a general entity. Returns either:
* - (Some s, None, None) meaning the definition of an internal entity
* with (literal) value s has been found
* - (None, Some x, None) meaning that an external parsed entity with
* external ID x has been found
* - (None, Some x, Some n) meaning that an unparsed entity with
* external ID x and notations n has been found
*/
str:Unparsed_string
{{ Some str, None, None }}
| id:external_id() ws:Ignore? Ignore* decl:ndatadecl()?
{{ if not ws && decl <> None then
raise(WF_error("Whitespace missing before `NDATA'"));
None, Some id, decl
}}
pedef():
/* parses the definition value of a parameter entity. Returns either:
* - (Some s, None) meaning that the definition of an internal entity
* with (literal) value s has been found
* - (None, Some x) meaning that an external ID x has been found
*/
str:Unparsed_string
{{ Some str, None }}
| id:external_id()
{{ None, Some id }}
ndatadecl():
/* Parses either NDATA "string" or the empty string; returns Some "string"
* in the former, None in the latter case.
*/
ndata:Name ws:Ignore Ignore* name:Name
{{ if ndata = "NDATA" then
name
else
raise(WF_error("NDATA expected"))
}}
? {{ match !yy_position with
"ws" -> raise(WF_error("Whitespace is missing after NDATA"))
| "name" -> raise(WF_error("Name expected"))
| _ -> raise(WF_error("Bad NDATA declaration"))
}}
/**************************** NOTATION DECLARATION *******************/
notationdecl():
/* parses <!NOTATION ... > and enters the notation declaration into the
* dtd object as side-effect
*/
decl_notation_entid: Decl_notation
ws1: Ignore Ignore*
name: Name
ws2: Ignore Ignore*
sys_or_public: Name /* SYSTEM or PUBLIC */
ws3: Ignore Ignore*
str1: Unparsed_string
ws: Ignore? Ignore*
str2: Unparsed_string?
Ignore*
decl_rangle_entid: Decl_rangle
{{
if decl_notation_entid != decl_rangle_entid then
raise (Validation_error "Entities not properly nested with NOTATION declaration");
let xid =
(* Note that it is allowed that PUBLIC is only followed by one
* string literal
*)
match sys_or_public with
"SYSTEM" ->
if str2 <> None then raise(WF_error("SYSTEM must be followed only by one argument"));
System (recode_utf8 str1)
| "PUBLIC" ->
begin match str2 with
None ->
check_public_id str1;
Public(recode_utf8 str1,"")
| Some p ->
if not ws then
raise(WF_error("Missing whitespace between the string literals of the `PUBLIC' id"));
check_public_id str1;
Public(recode_utf8 str1, recode_utf8 p)
end
| _ -> raise(WF_error("PUBLIC or SYSTEM expected"))
in
if extend_dtd then begin
let no = new dtd_notation name xid config.encoding in
dtd # add_notation no
end
}}
? {{ match !yy_position with
("ws1"|"ws2"|"ws3") -> raise(WF_error("Whitespace is missing"))
| "name" -> raise(WF_error("Name expected"))
| "sys_or_public" -> raise(WF_error("SYSTEM or PUBLIC expected"))
| ("str1"|"str2") -> raise(WF_error("String literal expected"))
| "decl_rangle_entid" -> raise(WF_error("`>' expected"))
| _ -> raise(WF_error("Bad NOTATION declaration"))
}}
/****************************** ELEMENTS **************************/
/* In the following rules, the number of error rules is reduced to
* improve the performance of the parser.
*/
contents_start():
/* parses <element>...</element> misc*, i.e. exactly one element followed
* optionally by white space or processing instructions.
* The element is entered into the global variables as follows:
* - If elstack is non-empty, the parsed element is added as new child to
* the top element of the stack.
* - If elstack is empty, the root_examplar object is modified rather than
* that a new element is created. If additionally the variable root is
* None, it is assigned Some root_examplar.
* Note that the modification of the root_exemplar is done by the method
* internal_init.
* The reason why the root element is modified rather than newly created
* is a typing requirement. It must be possible that the class of the root
* is derived from the original class element_impl, i.e. the user must be
* able to add additional methods. If we created a new root object, we
* would have to denote to which class the new object belongs; the root
* would always be an 'element_impl' object (and not a derived object).
* If we instead cloned an exemplar object and modified it by the
* "create" method, the root object would belong to the same class as the
* exemplar (good), but the type of the parsing function would always
* state that an 'element_impl' was created (because we can pass the new
* object only back via a global variable). The only solution is to
* modify the object that has been passed to the parsing function directly.
*/
$ {{ self # init_for_xml_body(); }}
start_tag() content()*
{{ () }}
content():
/* parses: start tags, end tags, content, or processing
* instructions. That the tags are properly nested is dynamically checked.
* As result, recognized elements are added to their parent elements,
* content is added to the element containing it, and processing instructions
* are entered into the element embracing them. (All as side-effects.)
*/
start_tag()
{{ () }}
| end_tag()
{{ () }}
| char_data()
{{ () }}
| cref()
{{ () }}
| pi()
{{ () }}
| entity_ref()
{{ () }}
| comment()
{{ () }}
entity_ref():
Begin_entity eref_xmldecl_then_rest()
{{ if n_tags_open = 0 then
raise(WF_error("Entity reference not allowed here"))
}}
/* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */
eref_xmldecl_then_rest():
pl:PI_xml
$ {{ context.manager # current_entity # process_xmldecl pl;
}}
content()* End_entity
{{ () }}
| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
content() content()* End_entity
{{ () }}
| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
End_entity
{{ () }}
start_tag():
/* parses <element attribute-values> or <element attribute-values/>.
*
* EFFECT: If elstack is non-empty, the element is added to the
* top element of the stack as new child, and the element
* is pushed on the stack. If elstack is empty, the root_exemplar is
* modified and gets the parsed name and attribute list. The root_exemplar
* is pushed on the stack. If additionally the variable root is empty, too,
* this variable is initialized.
* If the <element ... /> form has been parsed, no element is pushed
* on the stack.
*/
/* TODO: Support for namespaces. xmlns attributes must be identified.
* These atts are not added to the regular attribute list, but the
* contained namespace information is interpreted.
* We need a stack tracking namespace declarations (PUSH for every
* start tag; POP for every end tag). Element names and attribute names
* are splitted into src_prefix and localname. The src_prefix is mapped
* to norm_prefix using the mentioned stack. The element and attributes
* are created with names "norm_prefix:localname".
* If configured, the namespace_info field of elements is set from
* the current contents of the namespace stack.
* New config options:
* - namespace_manager = None or Some mng
* - namespace_objects = true or false
*/
tag: Tag_beg
$ {{ let position =
if config.store_element_positions then
Some(context.manager # position)
else
None
in
let attlist = ref [] in
let attspace = ref true in
}}
ws: Ignore? Ignore*
attribute(attlist,attspace)*
emptiness: start_tag_rangle()
/* Note: it is guaranteed that there is whitespace between Tag_beg and
* the name of the first attribute, because there must be some separator.
* So we need not to check ws!
*/
{{
let name0, tag_beg_entid = tag in
let d =
(* The following "match" returns the new element node: *)
match config.enable_namespace_processing with
None ->
(* Simple case: no namespaces *)
let name =
if config.enable_name_pool_for_element_types
then make_pool_string name0
else name0 in
create_element_node
?name_pool_for_attribute_values:
(if config.enable_name_pool_for_attribute_values
then Some config.name_pool
else None)
?position:position
spec dtd name !attlist
| Some mng ->
(* If namespace processing is enabled, preprocess the attribute
* list:
*)
let splitted_attlist =
List.map
(fun (name, value) -> namespace_split name, value)
!attlist
(* TODO: Check that localname matches NCName *)
in
let xmlns_attlist = ref [] in
let xmlns_default = ref None in
let regular_attlist = ref [] in
List.iter
(fun ((prefix, localname), value) ->
if prefix = "" && localname = "xmlns" then
xmlns_default := Some value
else
if prefix = "xmlns" then
xmlns_attlist := (localname, value) :: !xmlns_attlist
else
regular_attlist := (prefix, localname, value) ::
!regular_attlist
)
splitted_attlist;
(* Apply xmlns_attlist: *)
src_norm_mapping <- ( "!", default_normprefix ) ::
src_norm_mapping; (* guard *)
List.iter
(fun (srcprefix, uri) ->
let normprefix =
mng # lookup_or_add_namespace srcprefix uri in
src_norm_mapping <- (srcprefix, normprefix) :: src_norm_mapping;
)
!xmlns_attlist;
(* Apply xmlns_default: *)
( match !xmlns_default with
None -> ()
| Some "" ->
(* Delete default namespace: *)
default_normprefix <- "";
| Some uri ->
let normprefix =
try mng # get_normprefix uri
with Not_found ->
mng # lookup_or_add_namespace "default" uri
in
default_normprefix <- normprefix
);
(* Normalize the regular_attlist: *)
let norm_attlist =
List.map
(fun (prefix, localname, value) ->
(self # normalize_namespace_prefix prefix localname,
value
)
)
!regular_attlist
in
(* Normalize the element name: *)
let prefix, localname = namespace_split name0 in
(* TODO: Check that localname matches NCName *)
let norm_name0 =
self # normalize_namespace_prefix
~apply_default:true prefix localname in
let norm_name =
if config.enable_name_pool_for_element_types
then make_pool_string norm_name0
else norm_name0 in
let element =
create_element_node
?name_pool_for_attribute_values:
(if config.enable_name_pool_for_attribute_values
then Some config.name_pool
else None)
?position:position
spec dtd norm_name norm_attlist
in
if config.enable_namespace_info then begin
let info =
new namespace_info_impl
prefix
element
( ("!", default_normprefix) :: src_norm_mapping) in
element # set_namespace_info (Some info);
end;
element
(* end of match *)
in
begin match id_index with
None -> ()
| Some idx ->
(* Put the ID attribute into the index, if present *)
begin try
let v = d # id_attribute_value in (* may raise Not_found *)
idx # add v d (* may raise ID_not_unique *)
with
Not_found ->
(* No ID attribute *)
()
| ID_not_unique ->
(* There is already an ID with the same value *)
raise(Validation_error("ID not unique"))
end
end;
if n_tags_open = 0 then begin
if root = None then begin
(* We have found the begin tag of the root element. *)
if config.enable_super_root_node then begin
(* The user wants the super root instead of the real root.
* The real root element becomes the child of the VR.
*)
(* Assertion: self # current is the super root *)
assert (self # current # node_type = T_super_root);
root <- Some (self # current);
self # current # append_node d;
doc # init_root (self # current) name0;
end
else begin
(* Normal behaviour: The user wants to get the real root. *)
root <- Some d;
doc # init_root d name0;
end;
end
else
(* We have found a second topmost element. This is illegal. *)
raise(WF_error("Document must consist of only one toplevel element"))
end
else begin
(* We have found some inner begin tag. *)
self # save_data; (* Save outstanding data material first *)
self # current # append_node d
end;
if emptiness then begin
(* An empty tag like <a/>. *)
if not config.disable_content_validation then
d # validate_contents ~use_dfa:config.validate_by_dfa ~check_data_nodes:false ();
if config.enable_namespace_processing <> None then
self # pop_src_norm_mapping()
end
else begin
(* A non-empty tag. *)
stack_push (d, name0, tag_beg_entid) elstack;
n_tags_open <- n_tags_open + 1;
end;
}}
? {{ match !yy_position with
| "emptiness" -> raise(WF_error("`>' or `/>' expected"))
| _ -> raise(WF_error("Bad start tag"))
}}
attribute(attlist,attspace):
/* Parses name="value" */
n0:Name Ignore* Eq Ignore* v:attval() ws:Ignore? Ignore*
{{ if not !attspace then
raise(WF_error("Whitespace is missing before attribute `" ^
n0 ^ "'"));
let n =
if config.enable_name_pool_for_attribute_names
then make_pool_string n0
else n0 in
attlist := (n,v) :: !attlist;
attspace := ws;
()
}}
attval():
v:Attval
{{ expand_attvalue reused_lexbuf lexerset dtd v config.warner true }}
| v:Attval_nl_normalized
{{ expand_attvalue reused_lexbuf lexerset dtd v config.warner false }}
start_tag_rangle():
Rangle {{ false }}
| Rangle_empty {{ true }}
end_tag():
/* parses </element>.
* Pops the top element from the elstack and checks if it is the same
* element.
*/
tag:Tag_end Ignore* Rangle
{{ let name, tag_end_entid = tag in
if n_tags_open = 0 then
raise(WF_error("End-tag without start-tag"));
self # save_data; (* Save outstanding data material first *)
let x, x_name, tag_beg_entid = stack_pop elstack in
if config.enable_namespace_processing <> None then
self # pop_src_norm_mapping();
if name <> x_name then begin
let x_entname, x_line, x_col = x # position in
raise(WF_error("End tag `" ^ name ^
"' does not match start tag `" ^ x_name ^ "'" ^
(if x_line = 0 then "" else
" (was at line " ^ string_of_int x_line ^
", position " ^ string_of_int x_col ^ ")" )));
end;
if tag_beg_entid != tag_end_entid then
raise(WF_error("End tag `" ^ name ^
"' not in the same entity as the start tag `" ^
x_name ^ "'"));
if not config.disable_content_validation then
x # validate_contents ~use_dfa:config.validate_by_dfa ~check_data_nodes:false ();
n_tags_open <- n_tags_open - 1;
assert (n_tags_open >= 0);
}}
char_data():
/* Parses any literal characters not otherwise matching, and adds the
* characters to the top element of elstack.
* If elstack is empty, it is assumed that there is no surrounding
* element, and any non-white space character is forbidden.
*/
data:CharData
{{
if n_tags_open = 0 then
(* only white space is allowed *)
self # only_whitespace data
else
self # collect_data data
(* We collect the chardata material until the next end tag is
* reached. Then the collected material will concatenated and
* stored as a single T_data node (see end_tag rule above)
* using save_data.
*)
}}
| data:Cdata
{{
if n_tags_open = 0 then
raise (WF_error("CDATA section not allowed here"));
self # collect_data data
(* Also collect CDATA material *)
}}
cref():
/* Parses &#...; and adds the character to the top element of elstack. */
code:CRef
{{
if n_tags_open = 0 then
(* No surrounding element: character references are not allowed *)
raise(WF_error("Character reference not allowed here"));
self # collect_data (character config.encoding config.warner code)
(* Also collect character references *)
}}
pi():
/* Parses <?...?> (but not <?xml white-space ... ?>).
* If there is a top element in elstack, the processing instruction is added
* to this element.
*/
pi: PI
{{
let position =
if config.store_element_positions then
Some(context.manager # position)
else
None
in
let target0,value = pi in
let target =
if config.enable_name_pool_for_pinstr_targets
then make_pool_string target0
else target0 in
let pinstr = new proc_instruction target value config.encoding in
if n_tags_open = 0 && not config.enable_super_root_node
then
doc # add_pinstr pinstr
else begin
(* Special case: if processing instructions are processed inline,
* they are wrapped into T_pinstr nodes.
*)
if config.enable_pinstr_nodes then begin
if init_done then begin
self # save_data; (* Save outstanding data material first *)
let wrapper = create_pinstr_node
?position:position spec dtd pinstr in
self # current # append_node wrapper;
end
else self # add_early_pinstr_node position pinstr
end
else
(* Normal behaviour: Add the PI to the parent element. *)
if init_done then
self # current # add_pinstr pinstr
else
self # add_early_pinstr position pinstr
end
}}
comment():
/* Parses <!-- ... -->
*/
begin_entid:Comment_begin
$ {{
let position =
if config.enable_comment_nodes && config.store_element_positions then
Some(context.manager # position)
else
None
in
}}
mat: Comment_material*
end_entid: Comment_end
{{
if begin_entid != end_entid then
raise(Validation_error("The first and the last token of comments must be in the same entity"));
if config.enable_comment_nodes then begin
let comment_text = String.concat "" mat in
if init_done then begin
self # save_data; (* Save outstanding data material first *)
let wrapper = create_comment_node
?position:position spec dtd comment_text in
self # current # append_node wrapper;
end
else
self # add_early_comment position comment_text
end
}}
? {{ match !yy_position with
| "end_entid" -> raise(WF_error("`-->' expected"))
| _ -> raise(WF_error("Bad comment"))
}}
%%
(* The method "parse" continues here... *)
try
match start_symbol with
Ext_document ->
parse_ext_document context.current context.get_next
| Ext_declarations ->
parse_ext_declarations context.current context.get_next
| Ext_element ->
parse_ext_element context.current context.get_next
with
Not_found ->
raise Parsing.Parse_error
(*********** The method "parse" ends here *************)
(**********************************************************************)
(* Here ends the class definition: *)
end
;;
(**********************************************************************)
(*
class default_ext =
object(self)
val mutable node = (None : ('a extension node as 'a) option)
method clone = {< >}
method node =
match node with
None ->
assert false
| Some n -> n
method set_node n =
node <- Some n
end
;;
*)
class default_ext =
object(self : 'self)
method clone = self
method node =
(assert false : 'self node)
method set_node (n : 'self node) =
()
end
;;
let default_extension = new default_ext;;
let default_spec =
make_spec_from_mapping
~super_root_exemplar: (new super_root_impl default_extension)
~comment_exemplar: (new comment_impl default_extension)
~default_pinstr_exemplar: (new pinstr_impl default_extension)
~data_exemplar: (new data_impl default_extension)
~default_element_exemplar: (new element_impl default_extension)
~element_mapping: (Hashtbl.create 1)
()
;;
let default_namespace_spec =
make_spec_from_mapping
~super_root_exemplar: (new super_root_impl default_extension)
~comment_exemplar: (new comment_impl default_extension)
~default_pinstr_exemplar: (new pinstr_impl default_extension)
~data_exemplar: (new data_impl default_extension)
~default_element_exemplar: (new namespace_element_impl default_extension)
~element_mapping: (Hashtbl.create 1)
()
;;
let idref_pass id_index root =
let error t att value =
let name =
match t # node_type with
T_element name -> name
| _ -> assert false
in
let text =
"Attribute `" ^ att ^ "' of element `" ^ name ^
"' refers to unknown ID `" ^ value ^ "'" in
let pos_ent, pos_line, pos_col = t # position in
if pos_line = 0 then
raise(Validation_error text)
else
raise(At("In entity " ^ pos_ent ^ " at line " ^
string_of_int pos_line ^ ", position " ^ string_of_int pos_col ^
":\n",
Validation_error text))
in
let rec check_tree t =
let idref_atts = t # idref_attribute_names in
List.iter
(fun att ->
match t # attribute att with
Value s ->
begin try ignore(id_index # find s) with
Not_found ->
error t att s
end
| Valuelist l ->
List.iter
(fun s ->
try ignore(id_index # find s) with
Not_found ->
error t att s
)
l
| Implied_value -> ()
)
idref_atts;
List.iter check_tree (t # sub_nodes)
in
check_tree root
;;
exception Return_DTD of dtd;;
(* Used by extract_dtd_from_document_entity to jump out of the parser *)
let call_parser ~configuration:cfg
~source:src
~dtd
~extensible_dtd
~document:doc
~specification:spec
~process_xmldecl
~transform_dtd
~(id_index : 'ext #index option)
~use_document_entity
~entry
~init_lexer =
let w = cfg.warner in
let r, en =
match src with
Entity(m,r') -> r', m dtd
| ExtID(xid,r') -> r',
if use_document_entity then
new document_entity
r' dtd "[toplevel]" w xid
cfg.encoding
else
new external_entity
r' dtd "[toplevel]" w xid false
cfg.encoding
in
r # init_rep_encoding cfg.encoding;
r # init_warner w;
en # set_debugging_mode (cfg.debugging_mode);
let pobj =
new parser_object
doc
dtd
extensible_dtd
cfg
r
spec
process_xmldecl
transform_dtd
(id_index :> 'ext index option)
in
let mgr = new entity_manager en in
en # open_entity true init_lexer;
begin try
let context = make_context mgr in
pobj # parse context entry;
ignore(en # close_entity);
with
Return_DTD d ->
ignore(en # close_entity);
raise(Return_DTD d)
| Failure "Invalid UTF-8 stream" ->
(* raised by the wlex-generated lexers only: map to Malformed_code *)
ignore(en # close_entity);
r # close_all;
let pos = mgr # position_string in
raise (At(pos, Netconversion.Malformed_code))
| error ->
ignore(en # close_entity);
r # close_all;
let pos = mgr # position_string in
raise (At(pos, error))
end;
if cfg.idref_pass then begin
match id_index with
None -> ()
| Some idx ->
( match pobj # root with
None -> ()
| Some root ->
idref_pass idx root;
)
end;
pobj
let parse_dtd_entity cfg src =
(* Parse a DTD given as separate entity. *)
let dtd = new dtd cfg.warner cfg.encoding in
( match cfg.enable_namespace_processing with
Some mng -> dtd # set_namespace_manager mng
| None -> ()
);
let doc = new document cfg.warner cfg.encoding in
let pobj =
call_parser
~configuration:cfg
~source:src
~dtd:dtd
~extensible_dtd:true (* Extend the DTD by parsed declarations *)
~document:doc
~specification:default_spec
~process_xmldecl:false (* The XML declaration is ignored
* (except 'encoding')
*)
~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
~id_index: None
~use_document_entity:false
~entry:Ext_declarations (* Entry point of the grammar *)
~init_lexer:Declaration (* The initially used lexer *)
in
dtd # validate;
if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
dtd
;;
let parse_content_entity ?id_index cfg src dtd spec =
(* Parse an element given as separate entity *)
dtd # validate; (* ensure that the DTD is valid *)
if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
let doc = new document cfg.warner cfg.encoding in
let pobj =
call_parser
~configuration:cfg
~source:src
~dtd:dtd
~extensible_dtd:true (* Extend the DTD by parsed declarations *)
~document:doc
~specification:spec
~process_xmldecl:false (* The XML declaration is ignored
* (except 'encoding')
*)
~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
~id_index:(id_index :> 'ext index option)
~use_document_entity:false
~entry:Ext_element (* Entry point of the grammar *)
~init_lexer:Content (* The initially used lexer *)
in
match pobj # root with
Some r -> r
| None -> raise(WF_error("No root element"))
;;
let parse_wfcontent_entity cfg src spec =
let dtd = new dtd cfg.warner cfg.encoding in
(* Instead of dtd # allow_arbitrary, because the processing instruction
* survives marshalling:
*)
dtd # add_pinstr
(new proc_instruction
"pxp:dtd"
"optional-element-and-notation-declarations"
cfg.encoding);
( match cfg.enable_namespace_processing with
Some mng -> dtd # set_namespace_manager mng
| None -> ()
);
let doc = new document cfg.warner cfg.encoding in
let pobj =
call_parser
~configuration:cfg
~source:src
~dtd:dtd
~extensible_dtd:false (* Do not extend the DTD *)
~document:doc
~specification:spec
~process_xmldecl:false (* The XML declaration is ignored
* (except 'encoding')
*)
~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
~id_index:None
~use_document_entity:false
~entry:Ext_element (* Entry point of the grammar *)
~init_lexer:Content (* The initially used lexer *)
in
match pobj # root with
Some r -> r
| None -> raise(WF_error("No root element"))
;;
let iparse_document_entity ?(transform_dtd = (fun x -> x))
?id_index
cfg0 src spec p_wf =
(* Parse an element given as separate entity *)
(* p_wf: 'true' if in well-formedness mode, 'false' if in validating mode *)
let cfg = { cfg0 with
recognize_standalone_declaration =
cfg0.recognize_standalone_declaration && (not p_wf)
} in
let dtd = new dtd cfg.warner cfg.encoding in
if p_wf then begin
(* Instead of dtd # allow_arbitrary, because the processing instruction
* survives marshalling:
*)
dtd # add_pinstr
(new proc_instruction
"pxp:dtd"
"optional-element-and-notation-declarations"
cfg.encoding);
end;
( match cfg.enable_namespace_processing with
Some mng -> dtd # set_namespace_manager mng
| None -> ()
);
let doc = new document cfg.warner cfg.encoding in
let pobj =
call_parser
~configuration:cfg
~source:src
~dtd:dtd
~extensible_dtd:(not p_wf) (* Extend the DTD by parsed declarations
* only if in validating mode
*)
~document:doc
~specification:spec
~process_xmldecl:true (* The XML declaration is processed *)
(* TODO: change to 'not p_wf' ? *)
~transform_dtd:(fun dtd ->
let dtd' = transform_dtd dtd in
if cfg.accept_only_deterministic_models then
dtd' # only_deterministic_models;
dtd')
~id_index:(id_index :> 'ext index option)
~use_document_entity:true
~entry:Ext_document (* Entry point of the grammar *)
~init_lexer:Document (* The initially used lexer *)
in
pobj # doc
;;
let parse_document_entity ?(transform_dtd = (fun x -> x))
?id_index
cfg src spec =
iparse_document_entity
~transform_dtd:transform_dtd
?id_index:(id_index : 'ext #index option :> 'ext index option)
cfg src spec false;;
let parse_wfdocument_entity cfg src spec =
iparse_document_entity cfg src spec true;;
let extract_dtd_from_document_entity cfg src =
let transform_dtd dtd = raise (Return_DTD dtd) in
try
let doc = parse_document_entity
~transform_dtd:transform_dtd
cfg
src
default_spec in
(* Should not happen: *)
doc # dtd
with
Return_DTD dtd ->
(* The normal case: *)
dtd
;;
let default_config =
let w = new drop_warnings in
{ warner = w;
enable_pinstr_nodes = false;
enable_super_root_node = false;
enable_comment_nodes = false;
drop_ignorable_whitespace = true;
encoding = `Enc_iso88591;
recognize_standalone_declaration = true;
store_element_positions = true;
idref_pass = false;
validate_by_dfa = true;
accept_only_deterministic_models = true;
disable_content_validation = false;
name_pool = make_probabilistic_pool 10;
enable_name_pool_for_element_types = false;
enable_name_pool_for_attribute_names = false;
enable_name_pool_for_pinstr_targets = false;
enable_name_pool_for_attribute_values = false;
enable_namespace_processing = None;
enable_namespace_info = false;
debugging_mode = false;
}
let default_namespace_config =
{ default_config with
enable_namespace_processing = Some (new namespace_manager)
}
class [ 'ext ] hash_index =
object
constraint 'ext = 'ext node #extension
val ht = (Hashtbl.create 100 : (string, 'ext node) Hashtbl.t)
method add s n =
try
ignore(Hashtbl.find ht s);
raise ID_not_unique
with
Not_found ->
Hashtbl.add ht s n
method find s = Hashtbl.find ht s
method index = ht
end
(* ======================================================================
* History:
*
* $Log: pxp_yacc.m2y,v $
* Revision 1.36 2002/03/10 23:40:52 gerd
* type source is now primarily defined in Pxp_dtd.
*
* Revision 1.35 2002/02/20 00:25:23 gerd
* using Pxp_lexing instead of Lexing.
*
* Revision 1.34 2002/02/18 00:26:14 gerd
* Small optimization in method save_data.
*
* Revision 1.33 2001/10/12 21:38:14 gerd
* Changes for O'caml 3.03-alpha.
*
* Revision 1.32 2001/07/04 21:55:52 gerd
* Bugfix: Early comments and processing instructions (i.e.
* if they occur before the first element) are now handled correctly.
*
* Revision 1.31 2001/06/30 00:05:12 gerd
* Fix: When checking the type of the root element, namespace
* rewritings are taken into account.
*
* Revision 1.30 2001/06/29 14:44:35 gerd
* Fixed: ~transform_dtd works now if enable_super_root
*
* Revision 1.29 2001/06/28 22:42:07 gerd
* Fixed minor problems:
* - Comments must be contained in one entity
* - Pxp_document.document is now initialized with encoding.
* the DTD encoding may be initialized too late.
*
* Revision 1.28 2001/06/09 22:32:24 gerd
* Fixed the way set_namespace_info is called.
*
* Revision 1.27 2001/06/08 01:15:47 gerd
* Moved namespace_manager from Pxp_document to Pxp_dtd. This
* makes it possible that the DTD can recognize the processing instructions
* <?pxp:dtd namespace prefix="..." uri="..."?>, and add the namespace
* declaration to the manager.
*
* Revision 1.26 2001/06/07 22:55:14 gerd
* Uses methods classify_data_node, append_node, validate_contents
* now provided by nodes.
*
* Revision 1.25 2001/05/17 22:39:10 gerd
* Fix: default_spec
*
* Revision 1.24 2001/05/17 21:39:31 gerd
* Initial implementation of namespace parsing.
*
* Revision 1.23 2001/04/27 00:00:14 gerd
* Added a comment what to do to implement namespaces. See
* the rule start_tag.
*
* Revision 1.22 2001/04/24 21:07:13 gerd
* New option ~alt in from_channel and from_file.
*
* Revision 1.21 2001/04/22 15:15:40 gerd
* Improved error messages.
*
* Revision 1.20 2001/04/22 14:17:35 gerd
* from_channel uses now standard features of Pxp_reader, and
* is no longer a hack.
*
* Revision 1.19 2001/04/03 20:22:44 gerd
* New resolvers for catalogs of PUBLIC and SYSTEM IDs.
* Improved "combine": PUBLIC and SYSTEM IDs are handled
* separately.
* Rewritten from_file: Is now a simple application of the
* Pxp_reader classes and functions. (The same has still to be done
* for from_channel!)
*
* Revision 1.18 2000/10/01 19:49:04 gerd
* Many small optimizations, espcially in attribute parsing.
* New type array_stack.
*
* Revision 1.17 2000/09/21 21:30:46 gerd
* New option: disable_content_validation
*
* Revision 1.16 2000/09/16 22:48:23 gerd
* Failure "Invalid UTF-8 stream" which may raised by wlex-
* generated code is converted to Malformed_code.
* Instead of dtd#allow_arbitrary the corresponding
* processing instruction is added to the DTD. Advantage: When
* marshalled, this property will not be lost.
*
* Revision 1.15 2000/09/09 16:41:03 gerd
* Effort to reduce the amount of allocated memory: The number of
* instance variables in document nodes has been miminized; the class
* default_ext no longer stores anything; string pools have been implemented.
*
* Revision 1.14 2000/08/26 23:23:14 gerd
* Bug: from_file must not interpret the file name as URL path.
* Bug: When PI and comment nodes are generated, the collected data
* material must be saved first.
*
* Revision 1.13 2000/08/19 21:30:03 gerd
* Improved the error messages of the parser
*
* Revision 1.12 2000/08/18 20:16:25 gerd
* Implemented that Super root nodes, pinstr nodes and comment
* nodes are included into the document tree.
*
* Revision 1.11 2000/08/14 22:24:55 gerd
* Moved the module Pxp_encoding to the netstring package under
* the new name Netconversion.
*
* Revision 1.10 2000/07/23 02:16:33 gerd
* Support for DFAs.
*
* Revision 1.9 2000/07/14 13:57:29 gerd
* Added the id_index feature.
*
* Revision 1.8 2000/07/09 17:52:45 gerd
* New implementation for current_data.
* The position of elements is stored on demand.
*
* Revision 1.7 2000/07/09 01:00:35 gerd
* Improvement: It is now guaranteed that only one data node
* is added for consecutive character material.
*
* Revision 1.6 2000/07/08 16:27:29 gerd
* Cleaned up the functions calling the parser.
* New parser argument: transform_dtd.
* Implementations for 'extract_dtd_from_document_entity' and
* 'parse_wfcontent_entity'.
*
* Revision 1.5 2000/07/06 23:05:18 gerd
* Initializations of resolvers were missing.
*
* Revision 1.4 2000/07/06 22:11:01 gerd
* Fix: The creation of the non-virtual root element is protected
* in the same way as the virtual root element.
*
* Revision 1.3 2000/07/04 22:15:18 gerd
* Change: Using the new resolver capabilities.
* Still incomplete: the new extraction and parsing functions.
*
* Revision 1.2 2000/06/14 22:19:06 gerd
* Added checks such that it is impossible to mix encodings.
*
* Revision 1.1 2000/05/29 23:48:38 gerd
* Changed module names:
* Markup_aux into Pxp_aux
* Markup_codewriter into Pxp_codewriter
* Markup_document into Pxp_document
* Markup_dtd into Pxp_dtd
* Markup_entity into Pxp_entity
* Markup_lexer_types into Pxp_lexer_types
* Markup_reader into Pxp_reader
* Markup_types into Pxp_types
* Markup_yacc into Pxp_yacc
* See directory "compatibility" for (almost) compatible wrappers emulating
* Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
*
* ======================================================================
* Old logs from markup_yacc.m2y:
*
* Revision 1.9 2000/05/29 21:14:57 gerd
* Changed the type 'encoding' into a polymorphic variant.
*
* Revision 1.8 2000/05/27 19:26:19 gerd
* Change: The XML declaration is interpreted right after
* it has been parsed (no longer after the document): new function
* check_and_parse_xmldecl.
* When elements, attributes, and entities are declared
* it is stored whether the declaration happens in an external
* entity (for the standalone check).
* The option recognize_standalone_declaration is interpreted.
*
* Revision 1.7 2000/05/20 20:31:40 gerd
* Big change: Added support for various encodings of the
* internal representation.
*
* Revision 1.6 2000/05/14 21:51:24 gerd
* Change: Whitespace is handled by the grammar, and no longer
* by the entity.
*
* Revision 1.5 2000/05/14 17:50:54 gerd
* Updates because of changes in the token type.
*
* Revision 1.4 2000/05/11 22:09:17 gerd
* Fixed the remaining problems with conditional sections.
* This seems to be also a weakness of the XML spec!
*
* Revision 1.3 2000/05/09 00:02:44 gerd
* Conditional sections are now recognized by the parser.
* There seem some open questions; see the TODO comments!
*
* Revision 1.2 2000/05/08 22:01:44 gerd
* Introduced entity managers (see markup_entity.ml).
* The XML declaration is now recognized by the parser. If such
* a declaration is found, the method process_xmldecl of the currently
* active entity is called. If the first token is not an XML declaration,
* the method process_missing_xmldecl is called instead.
* Some minor changes.
*
* Revision 1.1 2000/05/06 23:21:49 gerd
* Initial revision.
*
*
* ======================================================================
*
* COPIED FROM REVISION 1.19 OF markup_yacc.mly
*
* Revision 1.19 2000/05/01 15:20:08 gerd
* "End tag matches start tag" is checked before "End tag in the
* same entity as start tag".
*
* Revision 1.18 2000/04/30 18:23:08 gerd
* Bigger change: Introduced the concept of virtual roots. First,
* this reduces the number of checks. Second, it makes it possible to
* return the virtual root to the caller instead of the real root (new
* config options 'virtual_root' and 'processing_instructions_inline').
* Minor changes because of better CR/CRLF handling.
*
* Revision 1.17 2000/03/13 23:47:46 gerd
* Updated because of interface changes. (See markup_yacc_shadow.mli
* rev. 1.8)
*
* Revision 1.16 2000/01/20 20:54:43 gerd
* New config.errors_with_line_numbers.
*
* Revision 1.15 1999/12/17 22:27:58 gerd
* Bugfix: The value of 'p_internal_subset' (an instance
* variable of the parser object) is to true when the internal subset
* begins, and is set to false when this subset ends. The error was
* that references to external entities within this subset did not
* set 'p_internal_subset' to false; this is now corrected by introducing
* the 'p_internal_subset_stack'.
* This is a typical example of how the code gets more and
* more complicated and that it is very difficult to really understand
* what is going on.
*
* Revision 1.14 1999/11/09 22:23:37 gerd
* Removed the invocation of "init_dtd" of the root document.
* This method is no longer available. The DTD is also passed to the
* document object by the root element, so nothing essential changes.
*
* Revision 1.13 1999/10/25 23:37:09 gerd
* Bugfix: The warning "More than one ATTLIST declaration for element
* type ..." is only generated if an ATTLIST is found while there are already
* attributes for the element.
*
* Revision 1.12 1999/09/01 23:08:38 gerd
* New frontend function: parse_wf_document. This simply uses
* a DTD that allows anything, and by the new parameter "extend_dtd" it is
* avoided that element, attlist, and notation declarations are added to this
* DTD. The idea is that this function simulates a well-formedness parser.
* Tag_beg, Tag_end carry the entity_id. The "elstack" stores the
* entity_id of the stacked tag. This was necessary because otherwise there
* are some examples to produces incorrectly nested elements.
* p_internal_subset is a variable that stores whether the internal
* subset is being parsed. This is important beacause entity declarations in
* internal subsets are not allowed to contain parameter references.
* It is checked if the "elstack" is empty after all has been parsed.
* Processing instructions outside DTDs and outside elements are now
* added to the document.
* The rules of mixed and regexp style content models have been
* separated. The code is now much simpler.
* Entity references outside elements are detected and rejected.
*
* Revision 1.11 1999/09/01 16:26:08 gerd
* Improved the quality of error messages.
*
* Revision 1.10 1999/08/31 19:13:31 gerd
* Added checks on proper PE nesting. The idea is that tokens such
* as Decl_element and Decl_rangle carry an entity ID with them. This ID
* is simply an object of type < >, i.e. you can only test on identity.
* The lexer always produces tokens with a dummy ID because it does not
* know which entity is the current one. The entity layer replaces the dummy
* ID with the actual ID. The parser checks that the IDs of pairs such as
* Decl_element and Decl_rangle are the same; otherwise a Validation_error
* is produced.
*
* Revision 1.9 1999/08/15 20:42:01 gerd
* Corrected a misleading message.
*
* Revision 1.8 1999/08/15 20:37:34 gerd
* Improved error messages.
* Bugfix: While parsing document entities, the subclass document_entity is
* now used instead of external_entity. The rules in document entities are a bit
* stronger.
*
* Revision 1.7 1999/08/15 14:03:59 gerd
* Empty documents are not allowed.
* "CDATA section not allowed here" is a WF_error, not a Validation_
* error.
*
* Revision 1.6 1999/08/15 02:24:19 gerd
* Removed some grammar rules that were used for testing.
* Documents without DTD can now have arbitrary elements (formerly
* they were not allowed to have any element).
*
* Revision 1.5 1999/08/14 22:57:20 gerd
* It is allowed that external entities are empty because the
* empty string is well-parsed for both declarations and contents. Empty
* entities can be referenced anywhere because the references are replaced
* by nothing. Because of this, the Begin_entity...End_entity brace is only
* inserted if the entity is non-empty. (Otherwise references to empty
* entities would not be allowed anywhere.)
* As a consequence, the grammar has been changed such that a
* single Eof is equivalent to Begin_entity,End_entity without content.
*
* Revision 1.4 1999/08/14 22:20:01 gerd
* The "config" slot has now a component "warner" which is
* an object with a "warn" method. This is used to warn about characters
* that cannot be represented in the Latin 1 alphabet.
* Furthermore, there is a new component "debugging_mode".
* Some Parse_error exceptions have been changed into Validation_error.
* The interfaces of functions/classes imported from other modules
* have changed; the invocations have been adapted.
* Contents may contain CDATA sections that have been forgotten.
*
* Revision 1.3 1999/08/11 15:00:41 gerd
* The Begin_entity ... End_entity brace is also possible in
* 'contents'.
* The configuration passed to the parsing object contains always
* the resolver that is actually used.
*
* Revision 1.2 1999/08/10 21:35:12 gerd
* The XML/encoding declaration at the beginning of entities is
* evaluated. In particular, entities have now a method "xml_declaration"
* which returns the name/value pairs of such a declaration. The "encoding"
* setting is interpreted by the entity itself; "version", and "standalone"
* are interpreted by Markup_yacc.parse_document_entity. Other settings
* are ignored (this does not conform to the standard; the standard prescribes
* that "version" MUST be given in the declaration of document; "standalone"
* and "encoding" CAN be declared; no other settings are allowed).
* TODO: The user should be warned if the standard is not exactly
* fulfilled. -- The "standalone" property is not checked yet.
*
* Revision 1.1 1999/08/10 00:35:52 gerd
* Initial revision.
*
*
*)