(* $Id: pxp_aux.ml,v 1.15 2002/03/13 22:45:42 gerd Exp $
* ----------------------------------------------------------------------
* PXP: The polymorphic XML parser for Objective Caml.
* Copyright by Gerd Stolpmann. See LICENSE for details.
* Some auxiliary functions
*)
(**********************************************************************)
(* Lexing *)
open Pxp_types
open Pxp_lexer_types
open Pxp_lexers
open Pxp_lib
open Netconversion
module HashedString = struct
type t = string
let equal (a:string) (b:string) = (a = b)
let hash = Hashtbl.hash
end;;
module Str_hashtbl = Hashtbl.Make(HashedString);;
module StringOrd = struct
type t = string
let compare = (compare : string -> string -> int)
end;;
module StringMap = Map.Make(StringOrd);;
(* 'a StringMap.t: the type of maps (dictionaries) from string to 'a *)
let character enc warner k =
assert (k>=0);
if (k >= 0xd800 & k < 0xe000) or (k >= 0xfffe & k <= 0xffff) or k > 0x10ffff
or (k < 8) or (k = 11) or (k = 12) or (k >= 14 & k <= 31)
then
raise (WF_error("Code point " ^ string_of_int k ^
" outside the accepted range of code points"));
try
makechar (enc : rep_encoding :> encoding) k
with
Not_found ->
warner # warn ("Code point cannot be represented in internal encoding: "
^ string_of_int k);
""
;;
let check_name warner name =
(* produces a warning for names beginning with "xml". *)
if String.length name >= 3 then begin
match String.sub name 0 3 with
("xml" | "xmL" | "xMl" | "xML" | "Xml" | "XmL" | "XMl" | "XML") ->
warner # warn ("Name is reserved for future extensions: " ^ name)
| _ ->
()
end
;;
let tokens_of_content_string lexerset s =
(* tokenizes general entities and character entities *)
let lexbuf = Pxp_lexing.from_string_inplace s in
let rec next_token () =
match lexerset.scan_content_string lexbuf with
Eof -> []
| CharData _ -> let tok = CharData (Lexing.lexeme lexbuf) in
tok :: next_token()
| tok -> tok :: next_token()
in
next_token()
;;
exception Quick_exit;;
let rec expand_attvalue_with_rec_check lexbuf l lexerset dtd warner entities norm_crlf =
(* recursively expands general entities and character entities;
* checks "standalone" document declaration;
* normalizes whitespace
*
* Exception: Quick_exit: the expanded value is equal to s
*)
match lexerset.scan_content_string lexbuf with
Eof -> []
| ERef n ->
if List.mem n entities then
raise(WF_error("Recursive reference to general entity `" ^ n ^ "'"));
let en, extdecl = dtd # gen_entity n in
if dtd # standalone_declaration && extdecl then
raise(Validation_error("Reference to entity `" ^ n ^
"' violates standalone declaration"));
let rtext, rtext_contains_ext_refs = en # replacement_text in
if rtext_contains_ext_refs then
raise(Validation_error("Found reference to external entity in attribute value"));
let l' =
try
expand_attvalue_with_rec_check
(Pxp_lexing.from_string_inplace rtext)
(String.length rtext)
lexerset dtd warner (n :: entities) false
with
Quick_exit -> [rtext]
in
l' @ expand_attvalue_with_rec_check
lexbuf l lexerset dtd warner entities norm_crlf
| CRef(-1) ->
if norm_crlf then begin
" " :: expand_attvalue_with_rec_check
lexbuf l lexerset dtd warner entities norm_crlf
end
else begin
" " :: expand_attvalue_with_rec_check
lexbuf l lexerset dtd warner entities norm_crlf
end
| CRef n ->
(character lexerset.lex_encoding warner n) ::
expand_attvalue_with_rec_check
lexbuf l lexerset dtd warner entities norm_crlf
| CharData _ ->
if Lexing.lexeme_char lexbuf 0 = '<' then
raise
(WF_error
("Attribute value contains character '<' literally"));
if Lexing.lexeme_start lexbuf = 0 &&
Lexing.lexeme_end lexbuf = l then
raise Quick_exit
else
let x = Lexing.lexeme lexbuf in
x :: expand_attvalue_with_rec_check
lexbuf l lexerset dtd warner entities norm_crlf
| _ -> assert false
;;
let expand_attvalue lexbuf lexerset dtd s warner norm_crlf =
(* norm_crlf: whether the sequence CRLF is recognized as one character or
* not (i.e. two characters).
* lexbuf: must result from a previous Lexing.from_string
*)
try
Pxp_lexing.from_another_string_inplace lexbuf s;
let l =
expand_attvalue_with_rec_check
lexbuf (String.length s) lexerset dtd warner [] norm_crlf in
String.concat "" l
with
Quick_exit ->
s
;;
type linecount =
{ mutable lines : int;
mutable columns : int;
}
;;
let rec count_lines_aux lc s n k =
let next_cr_or_lf = crlf_index_from s k in
if next_cr_or_lf >= 0 then begin
match s.[next_cr_or_lf] with
'\010' ->
count_lines_aux lc s (n+1) (next_cr_or_lf+1)
| '\013' ->
let l = String.length s in
if (next_cr_or_lf+1 < l && s.[next_cr_or_lf+1] = '\010') then
count_lines_aux lc s (n+1) (next_cr_or_lf+2)
else
count_lines_aux lc s (n+1) (next_cr_or_lf+1)
| _ ->
assert false
end
else begin
lc.lines <- n;
lc.columns <- String.length s - k;
end
;;
let count_lines lc s =
(* modifies lc: number of lines in s, number of columns of the last line *)
count_lines_aux lc s 0 0
;;
let tokens_of_xml_pi lexers s =
let lexbuf = Pxp_lexing.from_string_inplace (s ^ " ") in
let rec collect () =
let t = lexers.scan_xml_pi lexbuf in
match t with
Pro_eof -> []
| _ -> t :: collect()
in
collect()
;;
let decode_xml_pi pl =
(* 'pl' must consist of name="value" or name='value' pairs which are returned
* as list of pairs.
* The "value" is returned as it is; no substitution of &entities; happens.
*)
let rec decode pl =
match pl with
Pro_name name :: Pro_eq :: Pro_string value :: pl' ->
(name, value) :: decode pl'
| [] ->
[]
| _ ->
raise (WF_error("Bad XML processing instruction"))
in
decode pl
;;
let decode_doc_xml_pi pl =
match pl with
[ "version", v ] -> (v, None, None)
| [ "version", v; "encoding", e ] -> (v, Some e, None)
| [ "version", v; "standalone", s ] -> (v, None, Some s)
| [ "version", v; "encoding", e; "standalone", s ] -> (v, Some e, Some s)
| _ ->
raise(WF_error("Bad XML declaration"))
;;
let check_text_xml_pi pl =
match pl with
| [ "version", v; "encoding", e ] -> ()
| [ "encoding", e ] -> ()
| _ ->
raise(WF_error("Bad XML declaration"))
;;
let check_version_num s =
let l = String.length s in
for i = 0 to l - 1 do
match s.[i] with
('a'..'z'|'A'..'Z'|'0'..'9'|
'-'|'_'|'.'|':') -> ()
| _ ->
raise(WF_error("Bad XML version string"))
done
;;
let check_public_id s =
let l = String.length s in
for i = 0 to l - 1 do
match s.[i] with
(' '|'\013'|'\010'|'a'..'z'|'A'..'Z'|'0'..'9'|
'-'|'\''|'('|')'|'+'|','|'.'|'/'|':'|'='|'?'|
';'|'!'|'*'|'#'|'@'|'$'|'_'|'%') -> ()
| _ ->
raise(WF_error("Illegal character in PUBLIC identifier"))
done
;;
(**********************************************************************)
(* list functions *)
let rec check_dups l =
match l with
[] -> false
| c :: l' ->
if List.mem c l' then true else check_dups l'
;;
let rec count pred l =
match l with
[] -> 0
| x :: l' ->
if pred x then 1 + (count pred l') else count pred l'
;;
(**********************************************************************)
(* attributes *)
let check_attribute_value_lexically lexerset x t v =
(* raises x if the attribute value v does not match the lexical rules
* for attribute type t:
* - t = A_id: v must be a <name>
* - t = A_idref: v must match <name>
* - t = A_idrefs: v must match <names>
* - t = A_entity: v must match <name>
* - t = A_entities: v must match <names>
* - t = A_nmtoken: v must match <nmtoken>
* - t = A_nmtokens: v must match <nmtokens>
* - t = A_notation _: v must match <name>
* - t = A_enum _: v must match <nmtoken>
* - t = A_cdata: not checked
*)
let lexbuf = Pxp_lexing.from_string_inplace v in
let rec get_name_list() =
match lexerset.scan_name_string lexbuf with
Eof -> []
| Ignore -> get_name_list()
| tok -> tok :: get_name_list()
in
let l = get_name_list() in
match t with
(A_id | A_idref | A_entity | A_notation _) ->
begin match l with
[ Name n ] -> ()
| _ -> raise (Lazy.force x)
end
| (A_idrefs | A_entities) ->
if List.exists (fun tok ->
match tok with
Name _ -> false
| _ -> true) l then
raise (Lazy.force x)
| (A_nmtoken | A_enum _) ->
begin match l with
[ Name n ] -> ()
| [ Nametoken n ] -> ()
| _ -> raise (Lazy.force x)
end
| A_nmtokens ->
if List.exists (fun tok ->
match tok with
Name _ -> false
| Nametoken _ -> false
| _ -> true
) l then
raise (Lazy.force x)
| _ -> ()
;;
let split_attribute_value lexerset v =
(* splits 'v' into a list of names or nmtokens. The white space separating
* the names/nmtokens in 'v' is suppressed and not returned.
*)
let lexbuf = Pxp_lexing.from_string_inplace v in
let rec get_name_list() =
match lexerset.scan_name_string lexbuf with
Eof -> []
| Ignore -> get_name_list()
| Name s -> s :: get_name_list()
| Nametoken s -> s :: get_name_list()
| _ -> raise(Validation_error("Illegal attribute value"))
in
get_name_list()
;;
let rev_concat l =
(* = String.concat "" (List.rev l) *)
let k = ref 0 in
List.iter
(fun s -> k := !k + String.length s)
l;
let r = String.create !k in
List.iter
(fun s ->
let n = String.length s in
k := !k - n;
String.(*unsafe_*)blit s 0 r !k n;
)
l;
assert(!k = 0);
r
;;
let normalize_line_separators lexerset s =
(* Note: Returns [s] if [s] does not contain LFs *)
let lexbuf = Pxp_lexing.from_string_inplace s in
let rec get_string l =
match lexerset.scan_for_crlf lexbuf with
Eof -> l
| CharData s -> get_string (s::l)
| _ -> assert false
in
match get_string [] with
[] -> ""
| [s] -> s
| l -> rev_concat l
;;
let value_of_attribute_aux lexerset dtd n atype v =
(* See value_of_attribute below. *)
let lexical_error() =
lazy (raise(Validation_error("Attribute `" ^ n ^ "' is lexically malformed"))) in
let remove_leading_and_trailing_spaces u =
(* Precondition: 'u' matches <name> or <nmtoken> *)
match split_attribute_value lexerset u with
[ u' ] -> u'
| _ -> assert false
in
let check_ndata_entity u =
let en, extdecl = dtd # gen_entity u in (* or Validation_error *)
if not (en # is_ndata) then
raise(Validation_error("Reference to entity `" ^ u ^
"': NDATA entity expected"));
if dtd # standalone_declaration && extdecl then
raise(Validation_error("Reference to entity `" ^ u ^
"' violates standalone declaration"));
in
match atype with
A_cdata ->
Value v
| (A_id | A_idref | A_nmtoken) ->
check_attribute_value_lexically lexerset (lexical_error()) atype v;
Value (remove_leading_and_trailing_spaces v)
| A_entity ->
check_attribute_value_lexically lexerset (lexical_error()) atype v;
let v' = remove_leading_and_trailing_spaces v in
check_ndata_entity v';
Value v'
| (A_idrefs | A_nmtokens) ->
check_attribute_value_lexically lexerset (lexical_error()) atype v;
Valuelist (split_attribute_value lexerset v)
| A_entities ->
check_attribute_value_lexically lexerset (lexical_error()) atype v;
let l = split_attribute_value lexerset v in
List.iter check_ndata_entity l;
Valuelist l
| A_notation nl ->
check_attribute_value_lexically lexerset (lexical_error()) atype v;
let v' = remove_leading_and_trailing_spaces v in
if not (List.mem v' nl) then
raise(Validation_error
("Attribute `" ^ n ^
"' does not match one of the declared notation names"));
Value v'
| A_enum enuml ->
check_attribute_value_lexically lexerset (lexical_error()) atype v;
let v' = remove_leading_and_trailing_spaces v in
if not (List.mem v' enuml) then
raise(Validation_error
("Attribute `" ^ n ^
"' does not match one of the declared enumerator tokens"));
Value v'
;;
let value_of_attribute lexerset dtd n atype v =
(* The attribute with name 'n', type 'atype' and string value 'v' is
* decomposed, and the att_value is returned:
* - It is checked whether 'v' conforms to the lexical rules for attributes
* of type 'atype'
* - If 'atype <> A_cdata', leading and trailing spaces are removed from 'v'.
* - If 'atype = A_notation d', it is checked if 'v' matches one of the
* notation names contained in d.
* - If 'atype = A_enum d', it is checked whether 'v' matches one of the
* tokens from d
* - If 'atype' refers to a "single-value" type, the value is retured as
* Value u, where u is the normalized value. If 'atype' refers to a
* "list" type, the value if returned as Valuelist l, where l contains
* the tokens.
*
* Note that this function does not implement all normalization rules.
* It is expected that the string passed as 'v' is already preprocessed;
* i.e. character and entity references are resolved, and the substitution
* of white space characters by space characters has already been performed.
* If these requirements are met, the value returned by this function
* will be perfectly normalized.
*
* Further checks:
* - ENTITY and ENTITIES values: It is checked whether there is an
* unparsed general entity
* [ Other checks planned: ID, IDREF, IDREFS but not yet implemented ]
*)
if atype = A_cdata then
(* The most frequent case *)
Value v
else
value_of_attribute_aux lexerset dtd n atype v
(* Note that value_of_attribute_aux allocates memory for the local
* functions even if they are not called at all.
*)
;;
let check_value_of_attribute lexerset dtd n atype av =
(* This function checks whether the decomposed attribute value av
* matches the attribute type, i.e. it checks whether av is the
* result of the above function value_of_attribute for some
* unprocessed value v.
* If the check fails, a validation error is indicated.
*)
let lexical_error() =
lazy (raise(Validation_error("Attribute `" ^ n ^ "' is lexically malformed"))) in
let unexpected_valuelist() =
raise(Validation_error("A list value cannot be assigned to attribute `" ^
n ^ "'"))
in
let unexpected_value() =
raise(Validation_error("A non-list value cannot be assigned to attribute `" ^
n ^ "'"))
in
let no_leading_and_trailing_spaces u =
if u <> "" then begin
let is_whitespace c =
c = ' ' || c = '\t' || c = '\r' || c = '\n' in
if (is_whitespace u.[0]) || (is_whitespace u.[ String.length u - 1 ])
then
raise(Validation_error("Attribute `" ^ n ^ "' has leading or trailing whitespace"))
end
in
let check_ndata_entity u =
let en, extdecl = dtd # gen_entity u in (* or Validation_error *)
if not (en # is_ndata) then
raise(Validation_error("Reference to entity `" ^ u ^
"': NDATA entity expected"));
if dtd # standalone_declaration && extdecl then
raise(Validation_error("Reference to entity `" ^ u ^
"' violates standalone declaration"));
in
match atype with
A_cdata ->
(match av with
| Valuelist _ -> unexpected_valuelist()
| _ -> ()
)
| (A_id | A_idref | A_nmtoken) ->
(match av with
| Valuelist _ -> unexpected_valuelist()
| Value v ->
check_attribute_value_lexically lexerset (lexical_error()) atype v;
no_leading_and_trailing_spaces v
| _ -> ()
)
| A_entity ->
(match av with
| Valuelist _ -> unexpected_valuelist()
| Value v ->
check_attribute_value_lexically lexerset (lexical_error()) atype v;
no_leading_and_trailing_spaces v;
check_ndata_entity v
| _ -> ()
)
| (A_idrefs | A_nmtokens) ->
(match av with
| Value _ -> unexpected_value()
| Valuelist l ->
List.iter no_leading_and_trailing_spaces l;
let subst_type =
if atype = A_idrefs then A_id else A_nmtoken in
List.iter
(check_attribute_value_lexically
lexerset (lexical_error()) subst_type)
l
| _ -> ()
)
| A_entities ->
(match av with
| Value _ -> unexpected_value()
| Valuelist l ->
List.iter no_leading_and_trailing_spaces l;
List.iter
(check_attribute_value_lexically
lexerset (lexical_error()) A_entity)
l;
List.iter check_ndata_entity l
| _ -> ()
)
| A_notation nl ->
(match av with
| Valuelist _ -> unexpected_valuelist()
| Value v ->
check_attribute_value_lexically lexerset (lexical_error()) atype v;
no_leading_and_trailing_spaces v;
if not (List.mem v nl) then
raise(Validation_error
("Attribute `" ^ n ^
"' does not match one of the declared notation names"));
| _ -> ()
)
| A_enum enuml ->
(match av with
| Valuelist _ -> unexpected_valuelist()
| Value v ->
check_attribute_value_lexically lexerset (lexical_error()) atype v;
no_leading_and_trailing_spaces v;
if not (List.mem v enuml) then
raise(Validation_error
("Attribute `" ^ n ^
"' does not match one of the declared enumerator tokens"));
| _ -> ()
)
;;
let normalization_changes_value lexerset atype v =
(* Returns true if:
* - 'atype' is a "single-value" type, and the normalization of the string
* value 'v' of this type discards leading and/or trailing spaces
* - 'atype' is a "list" type, and the normalization of the string value
* 'v' of this type discards leading and/or trailing spaces, or spaces
* separating the tokens of the list (i.e. the normal form is that
* the tokens are separated by exactly one space character).
*
* Note: It is assumed that TABs, CRs, and LFs in 'v' are already converted
* to spaces.
*)
match atype with
A_cdata ->
false
| (A_id | A_idref | A_entity | A_nmtoken | A_notation _ | A_enum _) ->
(* Return 'true' if the first or last character is a space.
* The following check works for both ISO-8859-1 and UTF-8.
*)
v <> "" && (v.[0] = ' ' || v.[String.length v - 1] = ' ')
| (A_idrefs | A_entities | A_nmtokens) ->
(* Split the list, and concatenate the tokens as required by
* the normal form. Return 'true' if this operation results in
* a different string than 'v'.
* This check works for both ISO-8859-1 and UTF-8.
*)
let l = split_attribute_value lexerset v in
let v' = String.concat " " l in
v <> v'
;;
(**********************************************************************)
let re_multi_ws = Netstring_str.regexp "[ \010\013\009]+";;
let re_beg_ws = Netstring_str.regexp "^[ \010\013\009]+";;
let re_end_ws = Netstring_str.regexp "[ \010\013\009]+$";;
let normalize_public_id s =
(* Replaces multiple occurrences of white space characters by a single
*  , and removes leading and trailing white space.
*)
let s1 = Netstring_str.global_replace
re_multi_ws
" "
s in
let s2 = Netstring_str.global_replace
re_beg_ws
""
s1 in
let s3 = Netstring_str.global_replace
re_end_ws
""
s2 in
s3
;;
(**********************************************************************)
let namespace_split name =
(* Searches ':' in name and returns (prefix, localname).
* If there is no ':', prefix = "".
*)
try
let n = String.index name ':' in (* may raise Not_found *)
let prefix = String.sub name 0 n in
let localname = String.sub name (n+1) (String.length name - n - 1)
in
(prefix, localname)
with
Not_found -> ("", name)
;;
let extract_prefix name =
(* Returns the prefix of a name, or "" if there is no colon *)
try
let n = String.index name ':' in (* may raise Not_found *)
String.sub name 0 n
with
Not_found -> ""
;;
(**********************************************************************)
let write_markup_string ~(from_enc:rep_encoding) ~to_enc os s =
(* Write the 'from_enc'-encoded string 's' as 'to_enc'-encoded string to
* 'os'. All characters are written as they are.
*)
let s' =
if to_enc = (from_enc :> encoding)
then s
else recode_string
~in_enc:(from_enc :> encoding)
~out_enc:to_enc
~subst:(fun n ->
failwith
("Pxp_aux.write_markup_string: Cannot represent " ^
"code point " ^ string_of_int n))
s
in
write os s' 0 (String.length s')
;;
let write_data_string ~(from_enc:rep_encoding) ~to_enc os content =
(* Write the 'from_enc'-encoded string 's' as 'to_enc'-encoded string to
* 'os'. The characters '&', '<', '>', '"', '%' and every character that
* cannot be represented in 'to_enc' are paraphrased as entity reference
* "&...;".
*)
let convert_ascii s =
(* Convert the ASCII-encoded string 's'. Note that 'from_enc' is
* always ASCII-compatible
*)
if to_enc = (from_enc :> encoding)
then s
else
recode_string
~in_enc:(from_enc :> encoding)
~out_enc:to_enc
~subst:(fun n -> assert false)
s
in
let write_ascii s =
(* Write the ASCII-encoded string 's' *)
let s' = convert_ascii s in
write os s' 0 (String.length s')
in
let write_part j l =
(* Writes the substring of 'content' beginning at pos 'j' with length 'l'
*)
if to_enc = (from_enc :> encoding) then
write os content j l
else begin
let s' = recode_string
~in_enc:(from_enc :> encoding)
~out_enc:to_enc
~subst:(fun n ->
convert_ascii ("&#" ^ string_of_int n ^ ";"))
(String.sub content j l)
in
write os s' 0 (String.length s')
end
in
let i = ref 0 in
for k = 0 to String.length content - 1 do
match content.[k] with
('&' | '<' | '>' | '"' | '%') as c ->
if !i < k then
write_part !i (k - !i);
begin match c with
'&' -> write_ascii "&"
| '<' -> write_ascii "<"
| '>' -> write_ascii ">"
| '"' -> write_ascii """
| '%' -> write_ascii "%" (* reserved in DTDs *)
| _ -> assert false
end;
i := k+1
| _ -> ()
done;
if !i < String.length content then
write_part !i (String.length content - !i)
;;
(* ======================================================================
* History:
*
* $Log: pxp_aux.ml,v $
* Revision 1.15 2002/03/13 22:45:42 gerd
* Improved Pxp_lexing.
*
* Revision 1.14 2002/02/20 00:42:23 gerd
* New: rev_concat
* Improved normalize_line_separators; does no longer blow up
* memory if many lines are found
*
* Revision 1.13 2001/10/12 21:38:14 gerd
* Changes for O'caml 3.03-alpha.
*
* Revision 1.12 2001/06/27 23:34:54 gerd
* StringMap now here.
* New function extract_prefix.
*
* Revision 1.11 2001/06/07 22:51:10 gerd
* New function: check_value_of_attribute (needed for
* ~att_values option of create_element_node)
*
* Revision 1.10 2001/05/17 21:38:53 gerd
* New function namespace_split.
*
* Revision 1.9 2001/02/01 20:38:49 gerd
* New support for PUBLIC identifiers.
*
* Revision 1.8 2000/10/01 19:45:28 gerd
* New module: Str_hashtbl.
* Many optimizations, especially expand_attvalue and count_lines.
*
* Revision 1.7 2000/09/17 00:10:59 gerd
* Optimizations in expand_attvalue
*
* Revision 1.6 2000/08/14 22:24:55 gerd
* Moved the module Pxp_encoding to the netstring package under
* the new name Netconversion.
*
* Revision 1.5 2000/07/25 00:30:01 gerd
* Added support for pxp:dtd PI options.
*
* Revision 1.4 2000/07/16 18:31:09 gerd
* The exception Illegal_character has been dropped.
*
* Revision 1.3 2000/07/16 16:33:57 gerd
* New function write_markup_string: Handles the encoding
* of the string.
*
* Revision 1.2 2000/07/08 22:15:45 gerd
* [Merging 0.2.10:] write_data_string: The character '%' is special, too.
*
* Revision 1.1 2000/05/29 23:48:38 gerd
* Changed module names:
* Markup_aux into Pxp_aux
* Markup_codewriter into Pxp_codewriter
* Markup_document into Pxp_document
* Markup_dtd into Pxp_dtd
* Markup_entity into Pxp_entity
* Markup_lexer_types into Pxp_lexer_types
* Markup_reader into Pxp_reader
* Markup_types into Pxp_types
* Markup_yacc into Pxp_yacc
* See directory "compatibility" for (almost) compatible wrappers emulating
* Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
*
* ======================================================================
* Old logs from markup_aux.ml:
*
* Revision 1.12 2000/05/27 19:08:30 gerd
* Added functionality to check standalone declaration:
*
* expand_attvalue: Checks whether included entities violate the
* stand-alone declaration.
*
* value_of_attribute: Checks whether ENTITY/ENTITIES values violate
* this declaration. (Furthermore, it is checked whether the NDATA
* entity exists - this has been forgotten in previous versions.)
*
* value_of_attribute/check_attribute_value_lexically: improved.
*
* New function normalization_changes_value: helps detecting
* one case which violates the standalone declaration.
*
* Revision 1.11 2000/05/20 20:31:40 gerd
* Big change: Added support for various encodings of the
* internal representation.
*
* Revision 1.10 2000/05/01 20:41:56 gerd
* New function write_data_string.
*
* Revision 1.9 2000/04/30 18:11:31 gerd
* New function normalize_line_separators.
* In function expand_attvalue: New argument norm_crlf. If the attvalue
* is read directly from a file, the sequence CR LF must be converted to a
* single space. If the attvalue is read from a replacement text, CR LF has
* already converted to a single LF, and CR LF, if still occurring, must be
* converted to two spaces. The caller can indicate the case by passing
* true/false as norm_crlf.
*
* Revision 1.8 1999/09/01 22:51:07 gerd
* Added functions.
* 'character' raises Illegal_character if characters are found that
* do not match the production Char.
*
* Revision 1.7 1999/09/01 16:17:37 gerd
* Added function 'check_name'.
*
* Revision 1.6 1999/08/15 20:33:19 gerd
* Added: a function that checks public identifiers. Only certain
* characters may occur in these identifiers.
* Control characters are rejected by the "character" function.
* Bugfix: recursive entity references are detected in attribute
* expansion
*
* Revision 1.5 1999/08/15 02:18:02 gerd
* That '<' is not allowed in attribute values, is a violation
* of well-formedness, not of the validity; so WF_error is raised.
*
* Revision 1.4 1999/08/15 00:20:37 gerd
* When expanding attribute values, references to parameter
* entities are now resolved by the method "replacement_text" which
* has an additional return value, and no longer by "attlist_replacement_text".
* The new return value indicates whether references to external entities
* have been resolved (directly or indirectly); this is allowed at some
* locations but not in attribute values.
*
* Revision 1.3 1999/08/14 22:05:53 gerd
* Several functions have now a "warner" as argument which is
* an object with a "warn" method. This is used to warn about characters
* that cannot be represented in the Latin 1 alphabet.
*
* Revision 1.2 1999/08/10 21:35:06 gerd
* The XML/encoding declaration at the beginning of entities is
* evaluated. In particular, entities have now a method "xml_declaration"
* which returns the name/value pairs of such a declaration. The "encoding"
* setting is interpreted by the entity itself; "version", and "standalone"
* are interpreted by Markup_yacc.parse_document_entity. Other settings
* are ignored (this does not conform to the standard; the standard prescribes
* that "version" MUST be given in the declaration of document; "standalone"
* and "encoding" CAN be declared; no other settings are allowed).
* TODO: The user should be warned if the standard is not exactly
* fulfilled. -- The "standalone" property is not checked yet.
*
* Revision 1.1 1999/08/10 00:35:50 gerd
* Initial revision.
*
*
*)