(* $Id: pxp_entity.ml,v 1.16 2002/03/10 23:39:45 gerd Exp $
* ----------------------------------------------------------------------
* PXP: The polymorphic XML parser for Objective Caml.
* Copyright by Gerd Stolpmann. See LICENSE for details.
*)
open Pxp_types
open Pxp_lexer_types
open Pxp_aux
open Pxp_reader
(* Hierarchy of parsing layers:
*
* - Parser: Pxp_yacc
* + gets input stream from the main entity object
* + checks most of the grammar
* + creates the DTD object as side-effect
* + creates the element tree as side-effect
* + creates further entity objects that are entered into the DTD
* - Entity layer: Pxp_entity
* + gets input stream from the lexers, or another entity object
* + handles entity references: if a reference is encountered the
* input stream is redirected such that the tokens come from the
* referenced entity object
* + handles conditional sections
* - Lexer layer: Pxp_lexers
* + gets input from lexbuffers created by resolvers
* + different lexers for different lexical contexts
* + a lexer returns pairs (token,lexid), where token is the scanned
* token, and lexid is the name of the lexer that must be used for
* the next token
* - Resolver layer: Pxp_entity
* + a resolver creates the lexbuf from some character source
* + a resolver recodes the input and handles the encoding scheme
*)
(**********************************************************************)
(* Variables of type 'state' are used to insert Begin_entity and End_entity
* tokens into the stream.
* - At_beginning: Nothing has been read so far
* - First_token tok: A Begin_entity has been inserted; and the next token
* is 'tok' which is not Eof. (Begin_entity/End_entity must not be inserted
* if the entity is empty.)
* - In_stream: After the first token has been read, but befor Eof.
* - At_end: Eof has been read, and End_entity has been returned.
*)
type state =
At_beginning
| Inserted_begin_entity
| At_end
;;
(**********************************************************************)
class type ['entity] preliminary_dtd =
object
method standalone_declaration : bool
method gen_entity : string -> ('entity * bool)
method par_entity : string -> 'entity
end
;;
(* Instead of many instance variables we have many record components of
* one instance variable. Speeds the entity methods up.
*)
type 'entity entity_variables =
{ mutable dtd : 'entity preliminary_dtd;
mutable name : string;
mutable warner : collect_warnings;
mutable encoding : rep_encoding;
mutable lexerset : lexer_set;
mutable lexerset_scan_document : Lexing.lexbuf -> (token * lexers);
mutable lexerset_scan_document_type : Lexing.lexbuf -> (token * lexers);
mutable lexerset_scan_content : Lexing.lexbuf -> (token * lexers);
mutable lexerset_scan_within_tag : Lexing.lexbuf -> (token * lexers);
mutable lexerset_scan_declaration : Lexing.lexbuf -> (token * lexers);
mutable lexerset_scan_decl_comment : Lexing.lexbuf -> (token * lexers);
mutable lexerset_scan_content_comment : Lexing.lexbuf -> (token * lexers);
mutable lexerset_scan_document_comment : Lexing.lexbuf -> (token * lexers);
mutable lexbuf : Lexing.lexbuf;
(* The lexical buffer currently used as character source. *)
mutable prolog : prolog_token list option;
(* Stores the initial <?xml ...?> token as PI_xml *)
mutable prolog_pairs : (string * string) list;
(* If prolog <> None, these are the (name,value) pairs of the
* processing instruction.
*)
mutable lex_id : lexers;
(* The name of the lexer that should be used for the next token *)
mutable force_parameter_entity_parsing : bool;
(* 'true' forces that inner entities will always be embraced by
* Begin_entity and End_entity.
* 'false': the inner entity itself decides this
*)
mutable check_text_declaration : bool;
(* 'true': It is checked that the <?xml..?> declaration matches the
* production TextDecl.
*)
mutable normalize_newline : bool;
(* Whether this entity converts CRLF or CR to LF, or not *)
mutable line : int; (* current line *)
mutable column : int; (* current column *)
mutable p_line : int; (* previous line *)
mutable p_column : int; (* previous column *)
mutable linecount : linecount; (* aux component for line counting *)
mutable counts_as_external : bool;
(* Whether the entity counts as external (for the standalone check). *)
mutable at_bof : bool;
mutable deferred_token : token list option;
(* If you set this to Some tl, the next invocations of
* next_token_from_entity will return the tokens in tl.
* This makes it possible to insert tokens into the stream.
*)
mutable debug : bool;
}
;;
let make_variables the_dtd the_name the_warner init_encoding =
let ls = Pxp_lexers.get_lexer_set init_encoding in
{ dtd = (the_dtd : 'entity #preliminary_dtd :> 'entity preliminary_dtd);
name = the_name;
warner = the_warner;
encoding = init_encoding;
lexerset = ls;
lexerset_scan_document = ls.scan_document;
lexerset_scan_document_type = ls.scan_document_type;
lexerset_scan_content = ls.scan_content;
lexerset_scan_within_tag = ls.scan_within_tag;
lexerset_scan_declaration = ls.scan_declaration;
lexerset_scan_decl_comment = ls.scan_decl_comment;
lexerset_scan_content_comment = ls.scan_content_comment;
lexerset_scan_document_comment = ls.scan_document_comment;
lexbuf = Pxp_lexing.from_string "";
prolog = None;
prolog_pairs = [];
lex_id = Document;
force_parameter_entity_parsing = false;
check_text_declaration = true;
normalize_newline = true;
line = 1;
column = 0;
p_line = 1;
p_column = 1;
linecount = { lines = 0; columns = 0 };
counts_as_external = false;
at_bof = true;
deferred_token = None;
debug = false;
}
;;
let update_lines v =
let n_lines = v.linecount.lines in
let n_columns = v.linecount.columns in
v.line <- v.line + n_lines;
v.column <- if n_lines = 0 then v.column + n_columns else n_columns
;;
let update_content_lines v tok =
match tok with
LineEnd _ ->
v.line <- v.line + 1;
v.column <- 0;
| (PI(_,_)|PI_xml _|Cdata _) ->
count_lines v.linecount (Lexing.lexeme v.lexbuf);
update_lines v;
| _ ->
v.column <- v.column + Lexing.lexeme_end v.lexbuf
- Lexing.lexeme_start v.lexbuf
;;
let update_lines_within_tag v tok =
match tok with
Attval av ->
(* count av + delimiting quotes *)
count_lines v.linecount av;
if v.linecount.lines = 0 then
v.column <- v.column + v.linecount.columns + 2
else begin
update_lines v;
v.column <- v.column + 1;
end
| IgnoreLineEnd ->
v.line <- v.line + 1;
v.column <- 0;
| _ ->
v.column <- v.column + Lexing.lexeme_end v.lexbuf
- Lexing.lexeme_start v.lexbuf
;;
let update_other_lines v tok =
count_lines v.linecount (Lexing.lexeme v.lexbuf);
update_lines v;
;;
class virtual entity the_dtd the_name the_warner init_encoding =
object (self)
(* This class prescribes the type of all entity objects. Furthermore,
* the default 'next_token' mechanism is implemented.
*)
val v = make_variables the_dtd the_name the_warner init_encoding
method is_ndata = false
(* Returns if this entity is an NDATA (unparsed) entity *)
method name = v.name
method set_lex_id id = v.lex_id <- id
method line = v.p_line
method column = v.p_column
method encoding = v.encoding
(* method lexerset = lexerset *)
val mutable manager = None
(* The current entity_manager, see below *)
method private manager =
( match manager with
None -> assert false
| Some m -> m
: < current_entity : entity;
pop_entity : unit;
push_entity : entity -> unit >
)
method set_manager m = manager <- Some m
method counts_as_external = v.counts_as_external
method set_counts_as_external =
v.counts_as_external <- true
method virtual open_entity : bool -> lexers -> unit
(* open_entity force_parsing lexid:
* opens the entity, and the first token is scanned by the lexer
* 'lexid'. 'force_parsing' forces that Begin_entity and End_entity
* tokens embrace the inner tokens of the entity; otherwise this
* depends on the entity.
* By opening an entity, reading tokens from it, and finally closing
* the entity, the inclusion methods "Included",
* "Included if validating", and "Included as PE" can be carried out.
* Which method is chosen depends on the 'lexid', i.e. the lexical
* context: 'lexid = Content' performs "Included (if validating)" (we
* are always validating); 'lexid = Declaration' performs
* "Included as PE". The difference is which tokens are recognized,
* and how spaces are handled.
* 'force_parsing' causes that a Begin_entity token is inserted before
* and an End_entity token is inserted after the entity. The yacc
* rules allow the Begin_entity ... End_entity brace only at certain
* positions; this is used to restrict the possible positions where
* entities may be included, and to guarantee that the entity matches
* a certain production of the grammar ("parsed entities").
* 'open_entity' is currently invoked with 'force_parsing = true'
* for toplevel nodes, for inclusion of internal general entities,
* and for inclusion of parameter entities into document entities.
* 'force_parsing = false' is used for all other cases: External
* entities add the Begin_entity/End_entity tokens anyway; internal
* entities do not. Especially internal parameter entities referenced
* from non-document entities do not add these tokens.
*)
method virtual close_entity : lexers
(* close_entity:
* closes the entity and returns the name of the lexer that must
* be used to scan the next token.
*)
method virtual replacement_text : (string * bool)
(* replacement_text:
* returns the replacement text of the entity, and as second value,
* whether the replacement text was constructed by referencing
* external entities (directly or indirectly).
* This method implements the inclusion method "Included in Literal".
*)
method lexbuf = v.lexbuf
method xml_declaration =
(* return the (name,value) pairs of the initial <?xml name=value ...?>
* processing instruction.
*)
match v.prolog with
None ->
None
| Some p ->
Some v.prolog_pairs
method set_debugging_mode m =
v.debug <- m
method private virtual set_encoding : string -> unit
method full_name =
v.name
method next_token =
(* read next token from this entity *)
let v = v in (* Lookup the instance variable only once *)
let debug = v.debug in
match v.deferred_token with
Some toklist ->
( match toklist with
[] ->
v.deferred_token <- None;
self # next_token
| tok :: toklist' ->
v.deferred_token <- Some toklist';
if debug then
prerr_endline ("- Entity " ^ v.name ^ ": " ^ string_of_tok tok ^ " (deferred)");
tok
)
| None -> begin
v.p_line <- v.line;
v.p_column <- v.column;
(* Read the next token from the appropriate lexer lex_id, and get the
* name lex_id' of the next lexer to be used.
*)
let update_fn = ref update_content_lines in
let scan_fn =
match v.lex_id with
Document -> update_fn := update_other_lines;
v.lexerset_scan_document
| Document_type -> update_fn := update_other_lines;
v.lexerset_scan_document_type
| Content -> v.lexerset_scan_content
| Within_tag -> update_fn := update_lines_within_tag;
v.lexerset_scan_within_tag
| Declaration -> update_fn := update_other_lines;
v.lexerset_scan_declaration
| Content_comment -> update_fn := update_other_lines;
v.lexerset_scan_content_comment
| Decl_comment -> update_fn := update_other_lines;
v.lexerset_scan_decl_comment
| Document_comment -> update_fn := update_other_lines;
v.lexerset_scan_document_comment
| Ignored_section -> assert false
(* Ignored_section: only used by method next_ignored_token *)
in
let tok, lex_id' = scan_fn v.lexbuf in
(* Find out the number of lines and characters of the last line: *)
!update_fn v tok;
v.lex_id <- lex_id';
if debug then
prerr_endline ("- Entity " ^ v.name ^ ": " ^ string_of_tok tok);
(* Throw Ignore and Comment away; Interpret entity references: *)
(* NOTE: Of course, references to general entities are not allowed
* everywhere; parameter references, too. This is already done by the
* lexers, i.e. &name; and %name; are recognized only where they
* are allowed.
*)
let tok' =
match tok with
(* Entity references: *)
| ERef n ->
let en, extdecl = v.dtd # gen_entity n in
if v.dtd # standalone_declaration && extdecl then
raise
(Validation_error
("Reference to entity `" ^ n ^
"' violates standalone declaration"));
en # set_debugging_mode debug;
en # open_entity true v.lex_id;
self # manager # push_entity en;
en # next_token;
| PERef n ->
let en = v.dtd # par_entity n in
en # set_debugging_mode debug;
en # open_entity v.force_parameter_entity_parsing v.lex_id;
self # manager # push_entity en;
en # next_token;
(* Convert LineEnd to CharData *)
| LineEnd s ->
if v.normalize_newline then
CharData "\n"
else
CharData s
(* Also normalize CDATA sections *)
| Cdata value as cd ->
if v.normalize_newline then
Cdata(normalize_line_separators v.lexerset value)
else
cd
(* If there are CRLF sequences in a PI value, normalize them, too *)
| PI(name,value) as pi ->
if v.normalize_newline then
PI(name, normalize_line_separators v.lexerset value)
else
pi
(* Attribute values: If they are already normalized, they are turned
* into Attval_nl_normalized. This is detected by other code.
*)
| Attval value as av ->
if v.normalize_newline then
av
else
Attval_nl_normalized value
(* Another CRLF normalization case: Unparsed_string *)
| Unparsed_string value as ustr ->
if v.normalize_newline then
Unparsed_string(normalize_line_separators v.lexerset value)
else
ustr
(* Turn IgnoreLineEnd into Ignore *)
| IgnoreLineEnd -> Ignore
(* These tokens require that the entity_id parameter is set: *)
| Comment_begin _ -> Comment_begin(self :> entity_id)
| Comment_end _ -> Comment_end (self :> entity_id)
| Doctype _ -> Doctype (self :> entity_id)
| Doctype_rangle _ ->Doctype_rangle(self :> entity_id)
| Dtd_begin _ -> Dtd_begin (self :> entity_id)
| Dtd_end _ -> Dtd_end (self :> entity_id)
| Decl_element _ -> Decl_element (self :> entity_id)
| Decl_attlist _ -> Decl_attlist (self :> entity_id)
| Decl_entity _ -> Decl_entity (self :> entity_id)
| Decl_notation _ ->Decl_notation (self :> entity_id)
| Decl_rangle _ -> Decl_rangle (self :> entity_id)
| Lparen _ -> Lparen (self :> entity_id)
| Rparen _ -> Rparen (self :> entity_id)
| RparenPlus _ -> RparenPlus (self :> entity_id)
| RparenStar _ -> RparenStar (self :> entity_id)
| RparenQmark _ -> RparenQmark (self :> entity_id)
| Conditional_begin _ -> Conditional_begin (self :> entity_id)
| Conditional_body _ -> Conditional_body (self :> entity_id)
| Conditional_end _ -> Conditional_end (self :> entity_id)
| Tag_beg (n,_) -> Tag_beg (n, (self :> entity_id))
| Tag_end (n,_) -> Tag_end (n, (self :> entity_id))
(* End of file: *)
| Eof ->
if debug then begin
prerr_endline ("- Entity " ^ v.name ^ " # handle_eof");
let tok = self # handle_eof in
prerr_endline ("- Entity " ^ v.name ^ " # handle_eof: returns " ^ string_of_tok tok);
tok
end
else
self # handle_eof;
(* The default case. *)
| _ ->
tok
in
if v.at_bof then begin
v.at_bof <- false;
if tok <> Eof then begin
if debug then
prerr_endline ("- Entity " ^ v.name ^ " # handle_bof");
self # handle_bof tok'
end
else tok'
end
else tok'
end
(* 'handle_bof' and 'handle_eof' can be used as hooks. Behaviour:
*
* - Normally, the first token t is read in, and 'handle_bof t' is
* called. The return value of this method is what is returned to
* the user.
* - If the EOF has been reached, 'handle_eof' is called.
* - BUT: If the first token is already EOF, 'handle_eof' is called
* ONLY, and 'handle_bof' is NOT called.
*
* The default implementations:
* - handle_bof: does nothing
* - handle_eof: Pops the previous entity from the stack, switches back
* to this entity, and returns the next token of this entity.
*)
method private handle_bof tok =
tok
method private handle_eof =
let mng = self # manager in
begin try
mng # pop_entity;
let next_lex_id = self # close_entity in
let en = mng # current_entity in
en # set_lex_id next_lex_id;
en # next_token
with
Stack.Empty ->
(* The outermost entity is at EOF *)
Eof
end
method next_ignored_token =
(* used after <![ IGNORE *)
(* TODO: Do we need a test on deferred tokens here? *)
let tok, lex_id' = v.lexerset.scan_ignored_section v.lexbuf in
if v.debug then
prerr_endline ("- Entity " ^ v.name ^ ": " ^ string_of_tok tok ^ " (Ignored)");
update_other_lines v tok;
match tok with
| Conditional_begin _ -> Conditional_begin (self :> entity_id)
| Conditional_end _ -> Conditional_end (self :> entity_id)
| _ -> tok
method process_xmldecl pl =
(* The parser calls this method just after the XML declaration
* <?xml ...?> has been detected.
* 'pl': This is the argument of the PI_xml token.
*)
if v.debug then
prerr_endline ("- Entity " ^ v.name ^ " # process_xmldecl");
v.prolog <- Some pl;
v.prolog_pairs <- decode_xml_pi pl;
if v.check_text_declaration then
check_text_xml_pi v.prolog_pairs;
begin
try
let e = List.assoc "encoding" v.prolog_pairs in
self # set_encoding e
with
Not_found ->
self # set_encoding ""
end;
method process_missing_xmldecl =
(* The parser calls this method if the XML declaration is missing *)
if v.debug then
prerr_endline ("- Entity " ^ v.name ^ " # process_missing_xmldecl");
self # set_encoding ""
method ext_id =
(* Returns the external ID for external and NDATA entities. Raises
* Not_found for internal entities
*)
(raise Not_found : ext_id)
(* Methods for NDATA entities only: *)
method notation = (assert false : string)
end
;;
class ndata_entity the_name the_ext_id the_notation init_encoding =
object (self)
(* An NDATA entity is very restricted; more or less you can only find out
* its external ID and its notation.
*)
val mutable name = the_name
val mutable ext_id = the_ext_id
val mutable notation = the_notation
val encoding = (init_encoding : rep_encoding)
method name = (name : string)
method ext_id = (ext_id : ext_id)
method notation = (notation : string)
method is_ndata = true
method encoding = encoding
val mutable counts_as_external = false
method counts_as_external = counts_as_external
(* Whether the entity counts as external (for the standalone check). *)
method set_counts_as_external =
counts_as_external <- true
method set_manager (m : < current_entity : entity;
pop_entity : unit;
push_entity : entity -> unit >) =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: unit )
method set_lex_id (id : lexers) =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: unit )
method line =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: int )
method column =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: int )
method full_name =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: string )
method private set_encoding (_:string) =
assert false
method xml_declaration = (None : (string*string) list option)
method set_debugging_mode (_:bool) = ()
method open_entity (_:bool) (_:lexers) =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: unit )
method close_entity =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: lexers )
method replacement_text =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: (string * bool) )
method lexbuf =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: Lexing.lexbuf )
method next_token =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: token )
method next_ignored_token =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: token )
method process_xmldecl (pl:prolog_token list) =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: unit )
method process_missing_xmldecl =
( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
: unit )
end
;;
class external_entity the_resolver the_dtd the_name the_warner the_ext_id
the_p_special_empty_entities
init_encoding
=
object (self)
inherit entity
the_dtd the_name the_warner
init_encoding
as super
(* An external entity gets the lexbuf that is used as character source
* from a resolver.
* Furthermore, before the first token an Begin_entity is inserted, and
* before Eof an End_entity token is inserted into the stream. This done
* always regardless of the argument 'force_parsing' of the method
* 'open_entity'.
*
* 'the_p_internal_subset': see class internal_entity
* 'the_p_special_empty_entities': if true, a Begin_entity/End_entity
* brace is left out if the entity is otherwise empty.
*)
val resolver = (the_resolver : resolver)
val ext_id = (the_ext_id : ext_id)
val p_special_empty_entities = (the_p_special_empty_entities : bool)
val mutable resolver_is_open = false
(* Track if the resolver is open. This is also used to find recursive
* references of entities.
*)
val mutable state = At_beginning
initializer
v.counts_as_external <- true;
method private set_encoding e =
assert resolver_is_open;
resolver # change_encoding e
method full_name =
v.name ^
match ext_id with
System s -> " = SYSTEM \"" ^ s ^ "\""
| Public(p,s) -> " = PUBLIC \"" ^ p ^ "\" \"" ^ s ^ "\""
| Anonymous -> " = ANONYMOUS"
| Private _ -> " = PRIVATE"
method ext_id = ext_id
method open_entity force_parsing init_lex_id =
(* Note that external entities are always parsed, i.e. Begin_entity
* and End_entity tokens embrace the inner tokens to force that
* the entity is only called where the syntax allows it.
*)
if resolver_is_open then
raise(Validation_error("Recursive reference to entity `" ^ v.name ^ "'"));
let lex =
try
resolver # open_in ext_id
with
Pxp_reader.Not_competent ->
raise(Error ("No input method available for this external entity: " ^
self # full_name))
| Pxp_reader.Not_resolvable Not_found ->
raise(Error ("Unable to open the external entity: " ^
self # full_name))
| Pxp_reader.Not_resolvable e ->
raise(Error ("Unable to open the external entity: " ^
self # full_name ^ "; reason: " ^
string_of_exn e))
in
resolver_is_open <- true;
v.lexbuf <- lex;
v.prolog <- None;
v.lex_id <- init_lex_id;
state <- At_beginning;
v.line <- 1;
v.column <- 0;
v.at_bof <- true;
v.normalize_newline <- true;
method private handle_bof tok =
(* This hook is only called if the stream is not empty. *)
v.deferred_token <- Some [ tok ];
state <- Inserted_begin_entity;
Begin_entity
method private handle_eof =
(* This hook is called if the end of the stream is reached *)
match state with
At_beginning ->
(* This is only possible if the stream is empty. *)
if p_special_empty_entities then begin
(* Continue immediately with the next token *)
state <- At_end;
super # handle_eof
end
else begin
(* Insert Begin_entity / End_entity *)
v.deferred_token <- Some [ End_entity ];
state <- At_end;
Begin_entity;
(* After these two token have been processed, the lexer
* is called again, and it will return another Eof.
*)
end
| Inserted_begin_entity ->
(* Insert End_entity, too. *)
state <- At_end;
End_entity;
| At_end ->
(* Continue with the next token: *)
super # handle_eof
method close_entity =
if not resolver_is_open then
failwith ("External entity " ^ v.name ^ " not open");
resolver # close_in;
resolver_is_open <- false;
v.lex_id
method replacement_text =
(* Return the replacement text of the entity. The method used for this
* is more or less the same as for internal entities; i.e. character
* and parameter entities are resolved immediately. In addition to that,
* external entities may begin with an "xml" processing instruction
* which is considered not to be part of the replacement text.
*)
if resolver_is_open then
raise(Validation_error("Recursive reference to entity `" ^ v.name ^ "'"));
let lex =
try
resolver # open_in ext_id
with
Pxp_reader.Not_competent ->
raise(Error ("No input method available for this external entity: " ^
self # full_name))
| Pxp_reader.Not_resolvable Not_found ->
raise(Error ("Unable to open the external entity: " ^
self # full_name))
| Pxp_reader.Not_resolvable e ->
raise(Error ("Unable to open the external entity: " ^
self # full_name ^ "; reason: " ^
string_of_exn e))
in
resolver_is_open <- true;
v.lexbuf <- lex;
v.prolog <- None;
(* arbitrary: lex_id <- init_lex_id; *)
state <- At_beginning;
v.line <- 1;
v.column <- 0;
v.at_bof <- true;
(* First check if the first token of 'lex' is <?xml...?> *)
begin match v.lexerset.scan_only_xml_decl lex with
PI_xml pl ->
self # process_xmldecl pl
| Eof ->
(* This only means that the first token was not <?xml...?>;
* the "Eof" token represents the empty string.
*)
self # process_missing_xmldecl
| _ ->
(* Must not happen. *)
assert false
end;
(* Then create the replacement text. *)
let rec scan_and_expand () =
match v.lexerset.scan_dtd_string v.lexbuf with
ERef n -> "&" ^ n ^ ";" ^ scan_and_expand()
| CRef(-1) -> "\n" ^ scan_and_expand()
| CRef(-2) -> "\n" ^ scan_and_expand()
| CRef(-3) -> "\n" ^ scan_and_expand()
| CRef k -> character v.encoding v.warner k ^ scan_and_expand()
| CharData x -> x ^ scan_and_expand()
| PERef n ->
let en = v.dtd # par_entity n in
let (x,_) = en # replacement_text in
x ^ scan_and_expand()
| Eof ->
""
| _ ->
assert false
in
let rtext = scan_and_expand() in
resolver # close_in;
resolver_is_open <- false;
rtext, true
(* TODO:
* - The replaced text is not parsed [VALIDATION WEAKNESS]
*)
end
;;
class document_entity the_resolver the_dtd the_name the_warner the_ext_id
init_encoding
=
object (self)
inherit external_entity the_resolver the_dtd the_name the_warner
the_ext_id false
init_encoding
(* A document entity is an external entity that does not allow
* conditional sections, and that forces that internal parameter entities
* are properly nested.
*)
initializer
v.force_parameter_entity_parsing <- true;
v.check_text_declaration <- false;
method counts_as_external = false
(* Document entities count never as external! *)
end
;;
class internal_entity the_dtd the_name the_warner the_literal_value
the_p_internal_subset
init_is_parameter_entity
init_encoding
=
(* An internal entity uses a "literal entity value" as character source.
* This value is first expanded and preprocessed, i.e. character and
* parameter references are expanded.
*
* 'the_p_internal_subset': indicates that the entity is declared in the
* internal subset. Such entity declarations are not allowed to contain
* references to parameter entities.
* 'init_is_parameter_entity': whether this is a parameter entity or not
*)
object (self)
inherit entity
the_dtd the_name the_warner
init_encoding
as super
val p_internal_subset = the_p_internal_subset
val mutable replacement_text = ""
val mutable contains_external_references = false
val mutable p_parsed_actually = false
val mutable is_open = false
val mutable state = At_beginning
val mutable is_parameter_entity = init_is_parameter_entity
initializer
let lexbuf = Pxp_lexing.from_string the_literal_value in
let rec scan_and_expand () =
match v.lexerset.scan_dtd_string lexbuf with
ERef n -> "&" ^ n ^ ";" ^ scan_and_expand()
| CRef(-1) -> "\r\n" ^ scan_and_expand()
| CRef(-2) -> "\r" ^ scan_and_expand()
| CRef(-3) -> "\n" ^ scan_and_expand()
| CRef k -> character v.encoding v.warner k ^ scan_and_expand()
| CharData x -> x ^ scan_and_expand()
| PERef n ->
if p_internal_subset then
raise(WF_error("Restriction of the internal subset: parameter entity not allowed here"));
let en = v.dtd # par_entity n in
let (x, extref) = en # replacement_text in
contains_external_references <-
contains_external_references or extref;
x ^ scan_and_expand()
| Eof ->
""
| _ ->
assert false
in
is_open <- true;
replacement_text <- scan_and_expand();
is_open <- false;
v.normalize_newline <- false;
v.counts_as_external <- false;
method process_xmldecl (pl:prolog_token list) =
raise(Validation_error("The encoding cannot be changed in internal entities"))
method process_missing_xmldecl =
()
method private set_encoding e =
(* Ignored if e = "" *)
assert(e = "");
method open_entity force_parsing init_lex_id =
if is_open then
raise(Validation_error("Recursive reference to entity `" ^ v.name ^ "'"));
p_parsed_actually <- force_parsing;
v.lexbuf <- Pxp_lexing.from_string
(if is_parameter_entity then
(" " ^ replacement_text ^ " ")
else
replacement_text);
v.prolog <- None;
v.lex_id <- init_lex_id;
state <- At_beginning;
is_open <- true;
v.line <- 1;
v.column <- 0;
v.at_bof <- true; (* CHECK: Is this right? *)
method private handle_bof tok =
(* This hook is only called if the stream is not empty. *)
if p_parsed_actually then begin
v.deferred_token <- Some [ tok ];
state <- Inserted_begin_entity;
Begin_entity
end
else begin
state <- At_end;
tok
end
method private handle_eof =
(* This hook is called if the end of the stream is reached *)
match state with
At_beginning ->
(* This is only possible if the stream is empty. *)
if p_parsed_actually then begin
(* Insert Begin_entity / End_entity *)
v.deferred_token <- Some [ End_entity ];
state <- At_end;
Begin_entity;
(* After these two token have been processed, the lexer
* is called again, and it will return another Eof.
*)
end
else begin
(* Continue immediately with the next token *)
state <- At_end;
super # handle_eof
end
| Inserted_begin_entity ->
(* Insert End_entity, too. *)
state <- At_end;
End_entity;
| At_end ->
(* Continue with the next token: *)
super # handle_eof
method close_entity =
if not is_open then
failwith ("Internal entity " ^ v.name ^ " not open");
is_open <- false;
v.lex_id
method replacement_text =
if is_open then
raise(Validation_error("Recursive reference to entity `" ^ v.name ^ "'"));
replacement_text, contains_external_references
end
;;
(**********************************************************************)
(* An 'entity_manager' is a stack of entities, where the topmost entity
* is the currently active entity, the second entity is the entity that
* referred to the active entity, and so on.
*
* The entity_manager can communicate with the currently active entity.
*
* The entity_manager provides an interface for the parser; the functions
* returning the current token and the next token are exported.
*)
class entity_manager (init_entity : entity) =
object (self)
val mutable entity_stack = Stack.create()
val mutable current_entity = init_entity
val mutable current_entity's_full_name = lazy (init_entity # full_name)
val mutable yy_get_next_ref = ref (fun () -> assert false)
initializer
init_entity # set_manager (self :>
< current_entity : entity;
pop_entity : unit;
push_entity : entity -> unit >
);
yy_get_next_ref := (fun () -> init_entity # next_token)
method push_entity e =
e # set_manager (self :>
< current_entity : entity;
pop_entity : unit;
push_entity : entity -> unit >
);
Stack.push (current_entity, current_entity's_full_name) entity_stack;
current_entity <- e;
current_entity's_full_name <- lazy (e # full_name);
yy_get_next_ref := (fun () -> e # next_token);
method pop_entity =
(* May raise Stack.Empty *)
let e, e_name = Stack.pop entity_stack in
current_entity <- e;
current_entity's_full_name <- e_name;
yy_get_next_ref := (fun () -> e # next_token);
method position_string =
(* Gets a string describing the position of the last token;
* includes an entity backtrace
*)
let b = Buffer.create 200 in
Buffer.add_string b
("In entity " ^ current_entity # full_name
^ ", at line " ^ string_of_int (current_entity # line)
^ ", position " ^ string_of_int (current_entity # column)
^ ":\n");
Stack.iter
(fun (e, e_name) ->
Buffer.add_string b
("Called from entity " ^ Lazy.force e_name
^ ", line " ^ string_of_int (e # line)
^ ", position " ^ string_of_int (e # column)
^ ":\n");
)
entity_stack;
Buffer.contents b
method position =
(* Returns the triple (full_name, line, column) of the last token *)
Lazy.force current_entity's_full_name,
current_entity # line,
current_entity # column
method current_entity_counts_as_external =
(* Whether the current entity counts as external to the main
* document for the purpose of stand-alone checks.
*)
(* TODO: improve performance *)
let is_external = ref false in
let check (e, _) =
if e # counts_as_external then begin
is_external := true;
end;
in
check (current_entity,());
Stack.iter check entity_stack;
!is_external
method current_entity = current_entity
method yy_get_next_ref = yy_get_next_ref
end
;;
(* ======================================================================
* History:
*
* $Log: pxp_entity.ml,v $
* Revision 1.16 2002/03/10 23:39:45 gerd
* ext_id works also for external entities.
*
* Revision 1.15 2002/02/20 00:25:23 gerd
* using Pxp_lexing instead of Lexing.
*
* Revision 1.14 2001/06/28 22:42:07 gerd
* Fixed minor problems:
* - Comments must be contained in one entity
* - Pxp_document.document is now initialized with encoding.
* the DTD encoding may be initialized too late.
*
* Revision 1.13 2001/04/22 14:14:41 gerd
* Updated to support private IDs.
*
* Revision 1.12 2001/04/22 12:04:55 gerd
* external_entity, method replacement_text: catches errors
* from pxp_reader and transforms them
*
* Revision 1.11 2000/10/01 19:49:51 gerd
* Numerous optimizations in the class "entity".
*
* Revision 1.10 2000/09/21 21:28:16 gerd
* New token IgnoreLineEnd: simplifies line counting, and
* corrects a bug.
*
* Revision 1.9 2000/09/17 00:11:22 gerd
* Optimized line numbering.
*
* Revision 1.8 2000/09/09 16:39:05 gerd
* Changed comment.
*
* Revision 1.7 2000/09/05 21:52:31 gerd
* class internal_entity: Previously, the method open_entity
* intialized the slot last_token to Eof. This is wrong, because
* this causes that handle_bof is never called. The slot last_token
* is now initialized to Bof.
* Critical negative tests: data_jclark_notwf/not-sa/002,
* data_jclark_notwf/sa/153.xml, data_jclark_notwf/sa/161.xml. The
* error messages of these tests changed (checked; the new messages
* are better).
*
* Revision 1.6 2000/07/14 13:55:00 gerd
* Cosmetic changes.
*
* Revision 1.5 2000/07/09 17:51:50 gerd
* Entities return now the beginning of a token as its
* position.
* New method 'position' for entity_manager.
*
* Revision 1.4 2000/07/09 01:05:04 gerd
* Exported methods 'ext_id' and 'notation' anyway.
*
* Revision 1.3 2000/07/08 16:28:05 gerd
* Updated: Exception 'Not_resolvable' is taken into account.
*
* Revision 1.2 2000/07/04 22:12:47 gerd
* Update: Case ext_id = Anonymous.
* Update: Handling of the exception Not_competent when reading
* from a resolver.
*
* Revision 1.1 2000/05/29 23:48:38 gerd
* Changed module names:
* Markup_aux into Pxp_aux
* Markup_codewriter into Pxp_codewriter
* Markup_document into Pxp_document
* Markup_dtd into Pxp_dtd
* Markup_entity into Pxp_entity
* Markup_lexer_types into Pxp_lexer_types
* Markup_reader into Pxp_reader
* Markup_types into Pxp_types
* Markup_yacc into Pxp_yacc
* See directory "compatibility" for (almost) compatible wrappers emulating
* Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
*
* ======================================================================
* Old logs from markup_entity.ml:
*
* Revision 1.27 2000/05/29 21:14:57 gerd
* Changed the type 'encoding' into a polymorphic variant.
*
* Revision 1.26 2000/05/28 17:24:55 gerd
* Bugfixes.
*
* Revision 1.25 2000/05/27 19:23:32 gerd
* The entities store whether they count as external with
* respect to the standalone check: New methods counts_as_external
* and set_counts_as_external.
* The entity manager can find out whether the current
* entity counts as external: method current_entity_counts_as_external.
*
* Revision 1.24 2000/05/20 20:31:40 gerd
* Big change: Added support for various encodings of the
* internal representation.
*
* Revision 1.23 2000/05/14 21:51:24 gerd
* Change: Whitespace is handled by the grammar, and no longer
* by the entity.
*
* Revision 1.22 2000/05/14 17:50:54 gerd
* Updates because of changes in the token type.
*
* Revision 1.21 2000/05/09 00:02:44 gerd
* Conditional sections are now recognized by the parser.
* There seem some open questions; see the TODO comments!
*
* Revision 1.20 2000/05/08 21:58:22 gerd
* Introduced entity_manager as communication object between
* the parser and the currently active entity.
* New hooks handle_bof and handle_eof.
* Removed "delegated entities". The entity manager contains
* the stack of open entities.
* Changed the way Begin_entity and End_entity are inserted.
* This is now done by handle_bof and handle_eof.
* The XML declaration is no longer detected by the entity.
* This is now done by the parser.
*
* Revision 1.19 2000/05/01 15:18:44 gerd
* Improved CRLF handling in the replacement text of entities.
* Changed one error message.
*
* Revision 1.18 2000/04/30 18:18:39 gerd
* Bugfixes: The conversion of CR and CRLF to LF is now hopefully
* done right. The new variable "normalize_newline" indicates whether
* normalization must happen for that type of entity. The normalization
* if actually carried out separately for every token that needs it.
*
* Revision 1.17 2000/03/13 23:42:38 gerd
* Removed the resolver classes, and put them into their
* own module (Markup_reader).
*
* Revision 1.16 2000/02/22 01:06:58 gerd
* Bugfix: Resolvers are properly re-initialized. This bug caused
* that entities could not be referenced twice in the same document.
*
* Revision 1.15 2000/01/20 20:54:11 gerd
* New config.errors_with_line_numbers.
*
* Revision 1.14 2000/01/08 18:59:03 gerd
* Corrected the string resolver.
*
* Revision 1.13 1999/09/01 22:58:23 gerd
* Method warn_not_latin1 raises Illegal_character if the character
* does not match the Char production.
* External entities that are not document entities check if the
* <?xml...?> declaration at the beginning matches the TextDecl production.
* Method xml_declaration has type ... list option, not ... list.
* Tag_beg and Tag_end now carry an entity_id with them.
* The code to check empty entities has changed. That the Begin_entity/
* End_entity pair is not to be added must be explicitly turned on. See the
* description of empty entity handling in design.txt.
* In internal subsets entity declarations are not allowed to refer
* to parameter entities. The internal_entity class can do this now.
* The p_parsed parameter of internal_entity has gone. It was simply
* superflous.
*
* Revision 1.12 1999/09/01 16:24:13 gerd
* The method replacement_text returns the text as described for
* "included in literal". The former behaviour has been dropped to include
* a leading and a trailing space character for parameter entities.
* Bugfix: When general entities are included, they are always parsed.
*
* Revision 1.11 1999/08/31 19:13:31 gerd
* Added checks on proper PE nesting. The idea is that tokens such
* as Decl_element and Decl_rangle carry an entity ID with them. This ID
* is simply an object of type < >, i.e. you can only test on identity.
* The lexer always produces tokens with a dummy ID because it does not
* know which entity is the current one. The entity layer replaces the dummy
* ID with the actual ID. The parser checks that the IDs of pairs such as
* Decl_element and Decl_rangle are the same; otherwise a Validation_error
* is produced.
*
* Revision 1.10 1999/08/19 01:06:41 gerd
* Improved error messages: external entities print their
* ext id, too
*
* Revision 1.9 1999/08/15 20:35:48 gerd
* Improved error messages.
* Before the tokens Plus, Star, Qmark space is not allowed any longer.
* Detection of recursive entity references is a bit cleaner.
*
* Revision 1.8 1999/08/15 15:33:44 gerd
* Revised whitespace checking: At certain positions there must be
* white space. These checks cannot be part of the lexer, as %entity; counts
* as white space. They cannot be part of the yacc parser because one look-ahead
* token would not suffice if we did that. So these checks must be done by the
* entity layer. Luckily, the rules are simple: There are simply a number of
* token pairs between which white space must occur independently of where
* these token have been found. Two variables, "space_seen", and "last_token"
* have been added in order to check these rules.
*
* Revision 1.7 1999/08/15 00:41:06 gerd
* The [ token of conditional sections is now allowed to occur
* in a different entity.
*
* Revision 1.6 1999/08/15 00:29:02 gerd
* The method "attlist_replacement_text" has gone. There is now a
* more general "replacement_text" method that computes the replacement
* text for both internal and external entities. Additionally, this method
* returns whether references to external entities have been resolved;
* this is checked in the cases where formerly "attlist_replacement_text"
* was used as it is not allowed everywhere.
* Entities have a new slot "need_spaces" that indicates that the
* next token must be white space or a parameter reference. The problem
* was that "<!ATTLIST%e;" is legal because when including parameter
* entities white space is added implicitly. Formerly, the white space
* was expected by the underlying lexer; now the lexer does not check
* anymore that "<!ATTLIST" is followed by white space because the lexer
* cannot handle parameter references. Because of this, the check on
* white space must be done by the entity.
*
* Revision 1.5 1999/08/14 22:57:19 gerd
* It is allowed that external entities are empty because the
* empty string is well-parsed for both declarations and contents. Empty
* entities can be referenced anywhere because the references are replaced
* by nothing. Because of this, the Begin_entity...End_entity brace is only
* inserted if the entity is non-empty. (Otherwise references to empty
* entities would not be allowed anywhere.)
* As a consequence, the grammar has been changed such that a
* single Eof is equivalent to Begin_entity,End_entity without content.
*
* Revision 1.4 1999/08/14 22:11:19 gerd
* Several objects have now a "warner" as argument which is
* an object with a "warn" method. This is used to warn about characters
* that cannot be represented in the Latin 1 alphabet.
* Previously, the resolvers had features in order to warn about
* such characters; this has been removed.
* UTF-8 streams can be read even if they contain characters
* that cannot be represented by 16 bits.
* The buffering used in the resolvers is now solved in a
* cleaner way; the number of characters that are expected to be read
* from a source can be limited. This removes a bug with UTF-16 streams
* that previously lead to wrong exceptions; and the buffering is more
* efficient, too.
*
* Revision 1.3 1999/08/11 14:58:53 gerd
* Some more names for encodings are allowed, such as "utf8" instead
* of the standard name "UTF-8".
* 'resolve_as_file' interprets relative file names as relative to
* the "parent" resolver.
*
* Revision 1.2 1999/08/10 21:35:07 gerd
* The XML/encoding declaration at the beginning of entities is
* evaluated. In particular, entities have now a method "xml_declaration"
* which returns the name/value pairs of such a declaration. The "encoding"
* setting is interpreted by the entity itself; "version", and "standalone"
* are interpreted by Markup_yacc.parse_document_entity. Other settings
* are ignored (this does not conform to the standard; the standard prescribes
* that "version" MUST be given in the declaration of document; "standalone"
* and "encoding" CAN be declared; no other settings are allowed).
* TODO: The user should be warned if the standard is not exactly
* fulfilled. -- The "standalone" property is not checked yet.
*
* Revision 1.1 1999/08/10 00:35:51 gerd
* Initial revision.
*
*
*)