(* $Id: pxp_marshal.ml,v 1.9 2002/03/10 23:40:30 gerd Exp $
* ----------------------------------------------------------------------
*
*)
open Pxp_types
open Pxp_document
open Pxp_dtd
open Pxp_aux
open Pxp_yacc
type reconstruction_cmd =
Head of (string * bool) (* Encoding, standalone? *)
| Start_data_node of string
| Declare_element_exemplar of (int * string) (* number, eltype *)
| Declare_attribute of (int * string) (* number, name *)
| Start_element_node of ( (string * int * int) option * (* position *)
int * (* exemplar number *)
(int * att_value) list) (* attributes *)
| Start_super_root_node of ( (string * int * int) option ) (* position *)
| Start_comment_node of ( (string * int * int) option * (* position *)
string option ) (* comment text *)
| Start_pinstr_node of ( (string * int * int) option * (* position *)
string * (* target *)
string ) (* PI text *)
| Add_pinstr of (string * string) (* target, text *)
| End_node
| DTD_string of (string * (* root element *)
dtd_id *
string) (* The declarations as XML text *)
| Document of string (* XML version *)
| Root (* delimiter *)
| Cmd_array of reconstruction_cmd array
| Namespace_mapping of (string * string array)
(* normprefix, URIs (the last is the primary URI) *)
;;
(* Note: Marshalling is new in PXP 1.1 (did not exist in 1.0). The type
* reconstruction_cmd changed several times during the development of
* this module.
*)
(* A node is represented as sequence of reconstruction_cmd values in the
* following way:
*
* - Head
* - Start_xxx_node ...
* Begins the representation of the node with type xxx
* - Add_pinstr ...
* If necessary, one or several processing instructions (not for pinstr nodes)
* - Now the sub nodes in order: Start_xxx_node ... End_node
* - End_node
*
* If namespaces are used, the namespace_manager is prepended to the first
* (outermost) node of the tree. It is represented as a sequence of
* Namespace_mapping commands:
*
* - Namespace_mapping ...
* - Namespace_mapping ...
* - ...
*
* The namespace_info objects are (currently) not marshalled.
*
* A DTD is represented as a single DTD_string command.
*
* A Document is represented by:
* - Head
* - Document ...
* - DTD_string ...
* - Add_pinstr ...
* - Root
* - The root node: Start_xxx_node ... End_node
*)
let id (s:string) = s;;
exception Interruption
type 'ext job =
Marshal_node of 'ext Pxp_document.node
| Marshal_cmd of reconstruction_cmd
(* A job list is the plan that will marshal the rest of the whole
* tree
*)
type jobber =
Done
| Task of (unit -> jobber)
let recode_string ~in_enc ~out_enc =
Netconversion.recode_string
~in_enc
~out_enc
~subst:(fun k ->
failwith ("Pxp_marshal: Cannot recode code point " ^
string_of_int k ^ " from encoding " ^
Netconversion.string_of_encoding in_enc ^
" to encoding " ^
Netconversion.string_of_encoding out_enc))
;;
let subtree_to_cmd_sequence_nohead ~omit_positions ~recode write n : jobber =
(* Calls [write] for every command to write. The function [write] may
* raise [Interruption] to indicate that the job should be interrupted.
* This exception may be ignored, however, so it is necessary to raise
* it several times until it can be honoured.
* The result value of this function is either [Done] meaning that
* all work is done, or it is [Task f] meaning that the work is interrupted
* and may be restarted by executing f() (resulting again in Done or Task).
*)
let m = 100 in
let current_array = Array.create m End_node in (* Collects up to [m] cmds *)
let current_pos = ref 0 in (* next free index *)
let write_nobreak cmd =
(* Call [write] but ignore interruptions *)
if !current_pos < Array.length current_array then begin
current_array.( !current_pos ) <- cmd;
incr current_pos
end
else begin
( try
write (Cmd_array(Array.copy current_array))
with
Interruption -> () (* Ignore here *)
);
current_array.( 0 ) <- cmd;
current_pos := 1;
end
in
let breakpoint()=
(* Returns [true] if there was an [Interruption] *)
if !current_pos = Array.length current_array then begin
begin try
write (Cmd_array(Array.copy current_array));
current_pos := 0;
false
with
Interruption ->
current_pos := 0;
true
end;
end
else false
in
let finish() =
try
write (Cmd_array(Array.sub current_array 0 !current_pos))
with
Interruption -> () (* Ignore here *)
in
let next_ex_number = ref 0 in
let ex_hash = Hashtbl.create 100 in
let next_att_number = ref 0 in
let att_hash = Hashtbl.create 100 in
let rec do_subtree n : 'ext job list = (
(* Marshals some information, and returns the rest of what is to do
* as job list
*)
match n # node_type with
T_data ->
write_nobreak (Start_data_node (recode(n#data)));
write_nobreak End_node;
[]
| T_element eltype ->
let eltype = recode eltype in
let pos = get_position n in
let atts =
(* remove all Implied_value; use att_hash *)
List.map
(fun (name,a) ->
let name = recode name in
let a = match a with
Implied_value -> a
| Value s -> Value(recode s)
| Valuelist l -> Valuelist(List.map recode l)
in
try (Hashtbl.find att_hash name, a)
with
Not_found ->
let nr = !next_att_number in
incr next_att_number;
Hashtbl.add att_hash name nr;
write_nobreak (Declare_attribute(nr, name));
(nr, a)
)
(List.filter
(fun (_,a) -> a <> Implied_value)
(n # attributes)
)
in
let ex_nr =
try
Hashtbl.find ex_hash eltype
with
Not_found ->
let nr = !next_ex_number in
incr next_ex_number;
Hashtbl.add ex_hash eltype nr;
write_nobreak (Declare_element_exemplar(nr,eltype));
nr
in
write_nobreak (Start_element_node (pos, ex_nr, atts));
do_pinstr n;
plan_subnodes n @ [ Marshal_cmd End_node ]
| T_super_root ->
write_nobreak (Start_super_root_node (get_position n));
do_pinstr n;
plan_subnodes n @ [ Marshal_cmd End_node ]
| T_pinstr target ->
let pos = get_position n in
let l = n # pinstr target in
(match l with
[ pi ] ->
write_nobreak (Start_pinstr_node (pos, recode target, recode pi#value));
write_nobreak End_node;
[]
| _ ->
assert false
)
| T_comment ->
let pos = get_position n in
let comment' =
match n#comment with
None -> None
| Some c -> Some(recode c)
in
write_nobreak (Start_comment_node (pos, comment'));
write_nobreak End_node;
[]
| _ ->
assert false
)
and do_pinstr n =
let names = n # pinstr_names in
List.iter
(fun name ->
let pinstrs = n # pinstr name in
List.iter
(fun pi ->
write_nobreak (Add_pinstr (recode pi#target, recode pi#value));
)
pinstrs
)
names
and plan_subnodes n =
List.map (fun sn -> Marshal_node sn) n#sub_nodes
and get_position n =
if omit_positions then
None
else
let entity, line, column = n # position in
if line = 0 then None else Some (recode entity,line,column)
in
let emit_namespace_mappings () =
(* TODO: In general, too many mappings are emitted, even mappings
* which are unused in the processed subtree.
* It would also be possible to emit a mapping just before it is
* used.
*)
try
let mng = n # namespace_manager in
mng # iter_namespaces
(fun normprefix ->
let uris = Array.of_list (mng # get_uri_list normprefix) in
let uris' = Array.map recode uris in
write_nobreak (Namespace_mapping (recode normprefix, uris'))
);
()
with
Namespace_method_not_applicable _ ->
()
in
let rec exec_job_list jl =
match jl with
[] ->
finish();
Done
| Marshal_cmd cmd :: jl' ->
if breakpoint() then
Task(fun () -> exec_job_list jl)
else begin
write_nobreak cmd;
exec_job_list jl'
end
| Marshal_node node :: jl' ->
let plan = do_subtree node in
exec_job_list (plan @ jl')
in
emit_namespace_mappings();
exec_job_list [Marshal_node n]
;;
let subtree_to_cmd_sequence ?(omit_positions=false) ?enc f n =
let enc, recode = match enc with
None -> (n#encoding :> encoding), id
| Some e -> e, (recode_string
~in_enc:(n#encoding :> encoding)
~out_enc:e)
in
let encname = Netconversion.string_of_encoding enc in
let sa = n#dtd#standalone_declaration in
f(Head(encname,sa));
let r = subtree_to_cmd_sequence_nohead ~omit_positions ~recode f n in
assert(r = Done)
;;
let subtree_to_channel ?(omit_positions = false) ?enc ch n =
subtree_to_cmd_sequence
~omit_positions:omit_positions
?enc
(fun cmd ->
Marshal.to_channel ch cmd [ Marshal.No_sharing ]
)
n
;;
let subtree_from_cmd_sequence_nohead ~recode f0 dtd spec =
let current_array = ref( [| |] ) in
let current_pos = ref 0 in
let rec f() =
if !current_pos < Array.length !current_array then begin
incr current_pos;
!current_array.( !current_pos - 1)
end
else begin
let c = f0() in
match c with
Cmd_array a ->
current_array := a;
current_pos := 0;
f()
| _ ->
c
end
in
let recode_pos =
function
None -> None
| Some (pos_e,pos_l,pos_p) -> Some (recode pos_e,pos_l,pos_p)
in
let default = get_data_exemplar spec in
let eltypes = ref (Array.create 100 ("",default)) in
let atts = ref (Array.create 100 "") in
let mng = new namespace_manager in
let mng_found = ref false in
let enable_mng, dest_mng =
try
true, dtd # namespace_manager
with
Namespace_method_not_applicable _ ->
false, mng (* value does not matter *)
in
let map_nsprefix name =
let p, l = namespace_split name in
if p = "" then
name
else begin
(* p is mapped to the primary URI of mng, and then mapped
* to the normprefix of dest_mng
*)
try
let primary_uri = mng # get_primary_uri p in
let normprefix =
dest_mng # lookup_or_add_namespace p primary_uri in
normprefix ^ ":" ^ l
with
Not_found ->
failwith "Pxp_marshal.subtree_from_cmd_sequence: Found an undeclared namespace prefix"
end
in
let rec read_node dont_add first_cmd =
let n =
match first_cmd with
Start_data_node data ->
create_data_node spec dtd (recode data)
| Declare_element_exemplar (nr, eltype) ->
let eltype = recode eltype in
if nr > Array.length !eltypes then begin
eltypes :=
Array.append !eltypes (Array.create 100 ("",default));
end;
let eltype' =
if !mng_found then
(* Map the namespace prefix to a working normprefix *)
map_nsprefix eltype
else
eltype
in
!eltypes.(nr) <- (eltype',
get_element_exemplar spec eltype []);
read_node true (f())
| Declare_attribute (nr, name) ->
let name = recode name in
if nr > Array.length !atts then begin
atts :=
Array.append !atts (Array.create 100 "");
end;
let name' =
if !mng_found then
map_nsprefix name
else
name
in
!atts.(nr) <- name';
read_node true (f())
| Start_element_node (pos, nr, a) ->
let pos = recode_pos pos in
let eltype, ex = !eltypes.(nr) in
(* -- saves 4% time, but questionable approach:
ex # create_element
?position:pos
dtd
(T_element eltype)
atts
*)
let a' =
List.map
(fun (nr, v) ->
let v' = match v with
Implied_value -> Implied_value
| Value s -> Value(recode s)
| Valuelist l -> Valuelist(List.map recode l)
in
!atts.(nr), v'
)
a in
let e =
create_element_node
~att_values: a'
?position:pos
spec
dtd
eltype
[] in
e
| Start_super_root_node pos ->
let pos = recode_pos pos in
create_super_root_node ?position:pos spec dtd
| Start_comment_node (pos, comment) ->
let pos = recode_pos pos in
(match comment with
Some c ->
create_comment_node
?position:pos
spec
dtd
(recode c)
| None ->
let cn = create_comment_node
?position:pos
spec
dtd
"" in
cn # set_comment None;
cn
)
| Start_pinstr_node (pos, target, value) ->
let pos = recode_pos pos in
create_pinstr_node
?position:pos
spec
dtd
(new proc_instruction (recode target) (recode value) (dtd # encoding))
| Namespace_mapping (normprefix, uris) ->
let normprefix = recode normprefix in
if enable_mng then begin
let primary_uri = recode uris.( Array.length uris - 1 ) in
if normprefix <> "xml" then
mng # add_namespace normprefix primary_uri;
for i=0 to Array.length uris - 2 do
mng # add_uri normprefix (recode uris.(i))
done;
mng_found := true;
end;
read_node true (f())
| _ ->
assert false
in
let rec add () =
match f () with
(Start_data_node _ |
Start_element_node (_,_,_) |
Start_super_root_node _ |
Start_comment_node (_,_) |
Start_pinstr_node (_,_,_) |
(* Declare_xxx is always followed by Start_element_node: *)
Declare_element_exemplar(_,_) |
Declare_attribute(_,_)) as cmd ->
(* Add a new sub node *)
let n' = read_node false cmd in
n # add_node ~force:true n';
add()
| Add_pinstr(target,value) ->
let pi = new proc_instruction
(recode target) (recode value) (dtd # encoding) in
n # add_pinstr pi;
add()
| End_node ->
()
| _ ->
failwith "Pxp_marshal.subtree_from_cmd_sequence"
in
if not dont_add then
add();
n
in
read_node false (f())
;;
let subtree_from_cmd_sequence f dtd spec =
match f() with
Head(enc_s,_) ->
let enc = Netconversion.encoding_of_string enc_s in
let recode =
if (dtd # encoding :> encoding) = enc then
id
else
recode_string
~in_enc:enc
~out_enc:(dtd # encoding :> encoding)
in
subtree_from_cmd_sequence_nohead ~recode f dtd spec
| _ ->
failwith "Pxp_marshal.subtree_from_cmd_sequence"
;;
let subtree_from_channel ch dtd spec =
try
subtree_from_cmd_sequence
(fun () ->
Marshal.from_channel ch)
dtd
spec
with
End_of_file ->
failwith "Pxp_marshal.subtree_from_channel"
;;
let document_to_cmd_sequence ?(omit_positions = false) ?enc f
(doc : 'ext Pxp_document.document) =
let enc =
match enc with
None -> (doc # encoding :> encoding)
| Some e -> e
in
let encname = Netconversion.string_of_encoding enc in
let recode = recode_string
~in_enc:(doc # encoding :> encoding)
~out_enc:enc in
let sa = doc # dtd # standalone_declaration in
f (Head (encname, sa));
f (Document (doc # xml_version));
let dtd_buffer = Buffer.create 1000 in
doc # dtd # write (`Out_buffer dtd_buffer) enc false;
let r =
match doc # dtd # root with
None -> ""
| Some x -> recode x
in
let id =
match doc # dtd # id with
None -> Internal
| Some x -> x
(* Do not need to recode because DTD IDs are always UTF-8 *)
in
f (DTD_string (r,
id,
Buffer.contents dtd_buffer));
List.iter
(fun pi_name ->
List.iter
(fun pi ->
f (Add_pinstr (recode pi#target, recode pi#value))
)
(doc # pinstr pi_name)
)
(doc # pinstr_names);
f Root;
let r = subtree_to_cmd_sequence_nohead
~omit_positions
~recode
f
(doc # root) in
assert(r=Done)
;;
let document_to_channel ?(omit_positions = false) ?enc ch doc =
document_to_cmd_sequence
~omit_positions:omit_positions
?enc
(fun cmd ->
Marshal.to_channel ch cmd [ Marshal.No_sharing ]
)
doc
;;
let document_from_cmd_sequence f config spec =
let cmd0 = f() in
let enc_s, sa =
match cmd0 with
Head(e,s) -> e,s
| _ -> failwith "Pxp_marshal.document_from_cmd_sequence"
in
let enc = Netconversion.encoding_of_string enc_s in
let recode =
if enc = (config.encoding :> encoding) then
id
else
recode_string
~in_enc:enc
~out_enc:(config.encoding :> encoding)
in
let cmd1 = f() in
let xml_version =
match cmd1 with
Document v -> v
| _ -> failwith "Pxp_marshal.document_from_cmd_sequence"
in
let cmd2 = f() in
let root_type, id, dtd_string =
match cmd2 with
DTD_string(r,i,s) -> r,i,s
| _ -> failwith "Pxp_marshal.document_from_cmd_sequence"
in
let dtd =
Pxp_yacc.parse_dtd_entity
config
(Pxp_yacc.from_string
~fixenc:enc
dtd_string) in
if root_type <> "" then dtd # set_root (recode root_type);
dtd # set_id id;
dtd # set_standalone_declaration sa;
let doc = new Pxp_document.document config.Pxp_yacc.warner config.encoding in
doc # init_xml_version xml_version;
let cmd = ref (f()) in
while !cmd <> Root do
( match !cmd with
Add_pinstr(target,value) ->
let pi = new proc_instruction
(recode target) (recode value) config.encoding in
doc # add_pinstr pi
| _ ->
failwith "Pxp_marshal.document_from_cmd_sequence"
);
cmd := f();
done;
let root =
subtree_from_cmd_sequence_nohead
~recode
f dtd spec in
doc # init_root root (recode root_type);
doc
;;
let document_from_channel ch config spec =
try
document_from_cmd_sequence
(fun () ->
Marshal.from_channel ch)
config
spec
with
End_of_file ->
failwith "Pxp_marshal.document_from_channel"
;;
let relocate_subtree tree new_dtd new_spec =
let remaining_job = ref Done in
let available_cmds = Queue.create() in
let continue() =
match !remaining_job with
Done ->
assert false
| Task f ->
remaining_job := f()
in
let encname = Netconversion.string_of_encoding(tree#encoding :> encoding) in
let sa = tree#dtd#standalone_declaration in
Queue.add (Head(encname,sa)) available_cmds;
remaining_job :=
subtree_to_cmd_sequence_nohead
~omit_positions:false ~recode:id
(fun cmd ->
Queue.add cmd available_cmds;
raise Interruption
)
tree;
let rec next_cmd() =
let n_cmd =
try Some(Queue.take available_cmds) with Queue.Empty -> None in
match n_cmd with
None ->
continue();
assert(ignore(Queue.peek available_cmds); true);
next_cmd()
| Some cmd ->
cmd
in
let tree' =
subtree_from_cmd_sequence
next_cmd
new_dtd
new_spec in
assert(!remaining_job = Done);
tree'
;;
let relocate_document (doc : 'ext document) new_conf new_spec =
let recode =
recode_string
~in_enc:(doc # encoding :> encoding)
~out_enc:(new_conf.encoding :> encoding)
in
(* Relocate the DTD: *)
let dtd = doc # dtd in
let buf = Buffer.create 128 in
let enc = (new_conf.encoding :> encoding) in
dtd # write (`Out_buffer buf) enc false;
let new_dtd = parse_dtd_entity
new_conf (from_string ~fixenc:enc (Buffer.contents buf)) in
(* The following properties of the DTD do not survive a write/parse cycle: *)
if dtd # arbitrary_allowed then new_dtd # allow_arbitrary;
List.iter
(fun eltype ->
if (dtd # element eltype) # arbitrary_allowed then
(new_dtd # element eltype) # allow_arbitrary
)
(dtd # element_names);
new_dtd # set_standalone_declaration (dtd # standalone_declaration);
( match dtd # id with
None -> ()
| Some id -> new_dtd # set_id id;
);
( match dtd # root with
None -> ()
| Some r -> new_dtd # set_root (recode r)
);
(* Note: namespace_manager is set according to new_conf *)
(* Relocate the XML tree: *)
let new_root = relocate_subtree doc#root new_dtd new_spec in
(* Create a new document containing the new DTD and the new XML tree: *)
let new_doc = new document new_conf.warner new_conf.encoding in
new_doc # init_xml_version (doc # xml_version);
let root_name = match new_dtd # root with
Some rn -> rn
| None -> failwith "Pxp_marshal.relocate_document"
in
new_doc # init_root new_root root_name;
new_doc
;;
(* ======================================================================
* History:
*
* $Log: pxp_marshal.ml,v $
* Revision 1.9 2002/03/10 23:40:30 gerd
* It is now possible to change the character encoding when
* marshalling.
* New: relocate_subtree, relocate_document.
*
* Revision 1.8 2001/10/12 21:38:14 gerd
* Changes for O'caml 3.03-alpha.
*
* Revision 1.7 2001/06/30 00:05:12 gerd
* Fix: When checking the type of the root element, namespace
* rewritings are taken into account.
*
* Revision 1.6 2001/06/28 22:42:07 gerd
* Fixed minor problems:
* - Comments must be contained in one entity
* - Pxp_document.document is now initialized with encoding.
* the DTD encoding may be initialized too late.
*
* Revision 1.5 2001/06/27 23:33:53 gerd
* Type output_stream is now a polymorphic variant
*
* Revision 1.4 2001/06/08 01:15:47 gerd
* Moved namespace_manager from Pxp_document to Pxp_dtd. This
* makes it possible that the DTD can recognize the processing instructions
* <?pxp:dtd namespace prefix="..." uri="..."?>, and add the namespace
* declaration to the manager.
*
* Revision 1.3 2001/06/07 22:46:15 gerd
* Revised set of reconstruction commands:
* - Head: Contains the encoding
* - Declare_attributes: there is a name pool for attributes now
* - Start_element_node: uses att_value to represent attributes
* - DTD_string: contains the name of the root element, the DTD ID
* - Root: a delimiter before the root element begins
* - Namespace_mapping: represents the namespace manager
* As the Head command contains the encoding, it is checked whether
* the marshalling functions are called with the same encoding. Otherwise
* the functions fail.
* Fixed: The name of the root element, and the DTD ID are represented.
* Namespaces are supported.
* QA: This revision passes some regression tests (codewriter dir)
*
* Revision 1.2 2000/09/21 21:30:23 gerd
* Optimization: repeated elements are created from
* shared exemplars.
*
* Revision 1.1 2000/09/17 00:10:31 gerd
* Initial revision.
*
*
*)