(* $Id: netulex.ml 799 2004-07-08 23:04:25Z stolpmann $ * ---------------------------------------------------------------------- * PXP: The polymorphic XML parser for Objective Caml. * Copyright by Gerd Stolpmann. See LICENSE for details. *) module ULB = struct open Netaux.ArrayAux open Netconversion type unicode_lexbuf = { mutable ulb_encoding : encoding; mutable ulb_encoding_start : int; mutable ulb_rawbuf : string; mutable ulb_rawbuf_len : int; mutable ulb_rawbuf_end : int; mutable ulb_rawbuf_const : bool; mutable ulb_chars : int array; mutable ulb_chars_pos : int array; mutable ulb_chars_len : int; mutable ulb_eof : bool; mutable ulb_refill : string -> int -> int -> int; mutable ulb_enc_change_hook : unicode_lexbuf -> unit; mutable ulb_cursor : cursor } let from_function ?(raw_size = 512) ?(char_size = 250) ?(enc_change_hook = fun _ -> ()) ~refill enc = { ulb_encoding = enc; ulb_encoding_start = 0; ulb_rawbuf = String.create raw_size; ulb_rawbuf_len = 0; ulb_rawbuf_end = 0; ulb_rawbuf_const = false; ulb_chars = Array.make char_size (-1); ulb_chars_pos = ( let cp = Array.make (char_size+1) (-1) in cp.(0) <- 0; cp ); ulb_chars_len = 0; ulb_eof = false; ulb_refill = refill; ulb_enc_change_hook = enc_change_hook; ulb_cursor = create_cursor enc ""; } let from_in_obj_channel ?raw_size ?char_size ?enc_change_hook enc inch = let refill s k l = try let n = inch # input s k l in if n=0 then failwith "Netulex.ULB.from_in_obj_channel: non-blocking channel"; n with End_of_file -> 0 in from_function ?raw_size ?char_size ?enc_change_hook ~refill enc let from_string ?(enc_change_hook = fun _ -> ()) enc s = let char_size = 250 in { ulb_encoding = enc; ulb_encoding_start = 0; ulb_rawbuf = String.copy s; ulb_rawbuf_len = String.length s; ulb_rawbuf_end = 0; ulb_rawbuf_const = true; ulb_chars = Array.make char_size (-1); ulb_chars_pos = ( let cp = Array.make (char_size+1) (-1) in cp.(0) <- 0; cp ); ulb_chars_len = 0; ulb_eof = true; ulb_refill = (fun _ _ _ -> assert false); ulb_enc_change_hook = enc_change_hook; ulb_cursor = create_cursor enc ""; } let from_string_inplace ?(enc_change_hook = fun _ -> ()) enc s = let char_size = 250 in { ulb_encoding = enc; ulb_encoding_start = 0; ulb_rawbuf = s; ulb_rawbuf_len = String.length s; ulb_rawbuf_end = 0; ulb_rawbuf_const = true; ulb_chars = Array.make char_size (-1); ulb_chars_pos = ( let cp = Array.make (char_size+1) (-1) in cp.(0) <- 0; cp ); ulb_chars_len = 0; ulb_eof = true; ulb_refill = (fun _ _ _ -> assert false); ulb_enc_change_hook = enc_change_hook; ulb_cursor = create_cursor enc ""; } let delete n ulb = if n < 0 || n > ulb.ulb_chars_len then invalid_arg "Netulex.ULB.delete"; let m = ulb.ulb_chars_len - n in int_blit ulb.ulb_chars n ulb.ulb_chars 0 m; int_blit ulb.ulb_chars_pos n ulb.ulb_chars_pos 0 (m+1); if not ulb.ulb_rawbuf_const then ( let k = ulb.ulb_chars_pos.(0) in assert (ulb.ulb_rawbuf_end >= k); let m' = ulb.ulb_rawbuf_len - k in String.blit ulb.ulb_rawbuf k ulb.ulb_rawbuf 0 m'; let cp = ulb.ulb_chars_pos in for i = 0 to m do cp.(i) <- cp.(i) - k done; ulb.ulb_rawbuf_len <- m'; ulb.ulb_rawbuf_end <- ulb.ulb_rawbuf_end - k; ); ulb.ulb_chars_len <- m; ulb.ulb_encoding_start <- max 0 (ulb.ulb_encoding_start - n) let set_encoding enc ulb = if enc <> ulb.ulb_encoding then ( ulb.ulb_encoding <- enc; ulb.ulb_encoding_start <- ulb.ulb_chars_len; ulb.ulb_enc_change_hook ulb ) let close ulb = ulb.ulb_eof <- true let utf8_sub_string k n ulb = if k < 0 || k > ulb.ulb_chars_len || n < 0 || k+n > ulb.ulb_chars_len then invalid_arg "Netulex.ULB.utf8_sub_string"; if ulb.ulb_encoding = `Enc_utf8 && k >= ulb.ulb_encoding_start then ( (* Extract the substring from [ulb_rawbuf] ! *) let k' = ulb.ulb_chars_pos.(k) in let n' = ulb.ulb_chars_pos.(k+n) - k' in String.sub ulb.ulb_rawbuf k' n' ) else ( (* Create the UTF-8 string from [ulb_chars] *) ustring_of_uarray `Enc_utf8 ~pos:k ~len:n ulb.ulb_chars ) let utf8_sub_string_length k n ulb = if k < 0 || k > ulb.ulb_chars_len || n < 0 || k+n > ulb.ulb_chars_len then invalid_arg "Netulex.ULB.utf8_sub_string_length"; if ulb.ulb_encoding = `Enc_utf8 && k >= ulb.ulb_encoding_start then ( (* Extract the substring from [ulb_rawbuf] ! *) let k' = ulb.ulb_chars_pos.(k) in let n' = ulb.ulb_chars_pos.(k+n) - k' in n' ) else ( (* Count the UTF-8 string from [ulb_chars] *) (* Maybe better algorithm: divide into several slices, and call * ustring_of_uarray for them. Goal: Reduction of memory allocation *) let conv = ustring_of_uchar `Enc_utf8 in let n' = ref 0 in for i = k to k+n-1 do n' := !n' + String.length (conv ulb.ulb_chars.(i)) done; !n' ) let rec refill_aux ulb = (* Check whether we cannot add at least one byte to [ulb_chars] because * of EOF: *) if ulb.ulb_eof && ulb.ulb_rawbuf_len = ulb.ulb_rawbuf_end then 0 else ( (* Enlarge [ulb_chars] if necessary (need at least space for one character) *) if ulb.ulb_chars_len >= Array.length ulb.ulb_chars then ( let n = min (Sys.max_array_length-1) (2 * (Array.length ulb.ulb_chars)) in if n = Array.length ulb.ulb_chars then failwith "Netulex.ULB.refill: array too large"; let c = Array.make n (-1) in let cp = Array.make (n+1) (-1) in int_blit ulb.ulb_chars 0 c 0 ulb.ulb_chars_len; int_blit ulb.ulb_chars_pos 0 cp 0 (ulb.ulb_chars_len+1); ulb.ulb_chars <- c; ulb.ulb_chars_pos <- cp; ); (* If there is unanalysed material in [ulb_rawbuf], try to convert it. * It may happen, however, that there is only the beginning of a * multi-byte character, so this may not add any new character. *) let new_chars = if ulb.ulb_rawbuf_end < ulb.ulb_rawbuf_len then ( let cs = ulb.ulb_cursor in reinit_cursor ~range_pos:ulb.ulb_rawbuf_end ~range_len:(ulb.ulb_rawbuf_len - ulb.ulb_rawbuf_end) ~enc:ulb.ulb_encoding ulb.ulb_rawbuf cs; let counter = ref 0 in ( try while ulb.ulb_chars_len < Array.length ulb.ulb_chars do let space = Array.length ulb.ulb_chars - ulb.ulb_chars_len in (* cursor_blit may raise End_of_string, too *) let n = cursor_blit cs ulb.ulb_chars ulb.ulb_chars_len space in let n' = cursor_blit_positions cs ulb.ulb_chars_pos ulb.ulb_chars_len space in assert(n=n'); if n>0 then ( ulb.ulb_chars_len <- ulb.ulb_chars_len+n; counter := !counter + n; move ~num:n cs; (* may raise Malformed_code *) ) else ( (* We are at a special position in the string! *) try ignore(uchar_at cs); assert false with Byte_order_mark -> (* Skip the BOM: *) move cs (* may raise Malformed_code *) (* Note: this [move] does not count *) | Partial_character -> (* Stop here *) raise Exit (* End_of_string: already handled *) ) done with Exit -> () | End_of_string -> () ); let e = cursor_pos cs; in ulb.ulb_chars_pos.(ulb.ulb_chars_len) <- e; ulb.ulb_rawbuf_end <- e; (* Encoding might have changed: *) set_encoding (cursor_encoding cs) ulb; !counter ) else 0 in (* In the case we still did not add any char: Check if we are near * EOF (the last multi-byte character is not complete). *) if new_chars = 0 then ( if ulb.ulb_eof then raise Malformed_code; assert(not ulb.ulb_rawbuf_const); (* Now try to get new data into [ulb_rawbuf]. First, we check whether * we have enough free space in this buffer. We insist on at least * 50 bytes (quite arbitrary...). Then call the [ulb_refill] function * to get the data. *) if ulb.ulb_rawbuf_len + 50 >= String.length ulb.ulb_rawbuf then ( let n = min Sys.max_string_length (2 * (String.length ulb.ulb_rawbuf)) in if n = String.length ulb.ulb_rawbuf then failwith "Netulex.ULB.refill: string too large"; let s = String.create n in String.blit ulb.ulb_rawbuf 0 s 0 ulb.ulb_rawbuf_len; ulb.ulb_rawbuf <- s; ); (* Call now [ulb_refill]. If we detect EOF, record this. Anyway, * start over. *) let space = (String.length ulb.ulb_rawbuf) - ulb.ulb_rawbuf_len in let n = ulb.ulb_refill ulb.ulb_rawbuf ulb.ulb_rawbuf_len space in assert(n>=0); if n=0 then ( (* EOF *) ulb.ulb_eof <- true; ) else ( ulb.ulb_rawbuf_len <- ulb.ulb_rawbuf_len + n ); refill_aux ulb ) else new_chars ) let refill ulb = let n = refill_aux ulb in assert(n>=0); if n=0 then ( assert(ulb.ulb_eof); assert(ulb.ulb_rawbuf_len = ulb.ulb_rawbuf_end); raise End_of_file ) end module Ulexing = struct type lexbuf = { ulb : ULB.unicode_lexbuf; mutable offset : int; mutable pos : int; mutable start : int; mutable marked_pos : int; mutable marked_val : int; } exception Error let from_ulb_lexbuf ulb = { ulb = ulb; offset = 0; pos = 0; start = 0; marked_pos = 0; marked_val = 0; } let lexeme_start lexbuf = lexbuf.start + lexbuf.offset let lexeme_end lexbuf = lexbuf.pos + lexbuf.offset let lexeme_length lexbuf = lexbuf.pos - lexbuf.start let lexeme lexbuf = let buf = lexbuf.ulb.ULB.ulb_chars in Array.sub buf lexbuf.start (lexbuf.pos - lexbuf.start) let sub_lexeme lexbuf pos len = let buf = lexbuf.ulb.ULB.ulb_chars in Array.sub buf (lexbuf.start + pos) len let lexeme_char lexbuf pos = let buf = lexbuf.ulb.ULB.ulb_chars in buf.(lexbuf.start + pos) let utf8_lexeme lexbuf = ULB.utf8_sub_string lexbuf.start (lexbuf.pos - lexbuf.start) lexbuf.ulb let utf8_sub_lexeme lexbuf pos len = ULB.utf8_sub_string (lexbuf.start + pos) len lexbuf.ulb let utf8_sub_lexeme_length lexbuf pos len = ULB.utf8_sub_string_length (lexbuf.start + pos) len lexbuf.ulb (* "Internal" interface *) let start lexbuf = lexbuf.start <- lexbuf.pos; lexbuf.marked_pos <- lexbuf.pos; lexbuf.marked_val <- (-1) let mark lexbuf i = lexbuf.marked_pos <- lexbuf.pos; lexbuf.marked_val <- i let backtrack lexbuf = lexbuf.pos <- lexbuf.marked_pos; lexbuf.marked_val let rollback lexbuf = lexbuf.pos <- lexbuf.start let eof = (-1) let refill lexbuf = try (* Delete all characters in ulexbuf before the current lexeme: *) if lexbuf.start > 0 then ( let n = lexbuf.start in ULB.delete n lexbuf.ulb; lexbuf.offset <- lexbuf.offset + n; lexbuf.pos <- lexbuf.pos - n; lexbuf.marked_pos <- lexbuf.marked_pos - n; lexbuf.start <- 0; ); ULB.refill lexbuf.ulb; (* raises either End_of_file, or ensures there is one char in ulb *) lexbuf.ulb.ULB.ulb_chars.(lexbuf.pos) with End_of_file -> (* We cannot modify the buffer as the original Ulexing implementation *) eof let next lexbuf = let ulb = lexbuf.ulb in let i = if lexbuf.pos = ulb.ULB.ulb_chars_len then refill lexbuf else ulb.ULB.ulb_chars.(lexbuf.pos) in if i <> eof then lexbuf.pos <- lexbuf.pos + 1; i end