Plasma GitLab Archive
Projects Blog Knowledge

(* $Id: netulex.ml 799 2004-07-08 23:04:25Z stolpmann $
 * ----------------------------------------------------------------------
 * PXP: The polymorphic XML parser for Objective Caml.
 * Copyright by Gerd Stolpmann. See LICENSE for details.
 *)


module ULB = struct
  open Netaux.ArrayAux
  open Netconversion

  type unicode_lexbuf =
	{ mutable ulb_encoding : encoding;
	  mutable ulb_encoding_start : int;
	  mutable ulb_rawbuf : string;
	  mutable ulb_rawbuf_len : int;
	  mutable ulb_rawbuf_end : int;
	  mutable ulb_rawbuf_const : bool;
	  mutable ulb_chars : int array;
	  mutable ulb_chars_pos : int array;
	  mutable ulb_chars_len : int;
	  mutable ulb_eof : bool;
	  mutable ulb_refill : string -> int -> int -> int;
	  mutable ulb_enc_change_hook : unicode_lexbuf -> unit;
	  mutable ulb_cursor : cursor
	}

  let from_function ?(raw_size = 512) ?(char_size = 250) 
                    ?(enc_change_hook = fun _ -> ())
                    ~refill enc =
    { ulb_encoding = enc;
      ulb_encoding_start = 0;
      ulb_rawbuf = String.create raw_size;
      ulb_rawbuf_len = 0;
      ulb_rawbuf_end = 0;
      ulb_rawbuf_const = false;
      ulb_chars = Array.make char_size (-1);
      ulb_chars_pos = ( let cp = Array.make (char_size+1) (-1) in
			cp.(0) <- 0;
			cp );
      ulb_chars_len = 0;
      ulb_eof = false;
      ulb_refill = refill;
      ulb_enc_change_hook = enc_change_hook;
      ulb_cursor = create_cursor enc "";
    }

  let from_in_obj_channel ?raw_size ?char_size ?enc_change_hook enc inch =
    let refill s k l =
      try
	let n = inch # input s k l in
	if n=0 then
	  failwith "Netulex.ULB.from_in_obj_channel: non-blocking channel";
	n
      with
	  End_of_file -> 0
    in
    from_function ?raw_size ?char_size ?enc_change_hook ~refill enc

  let from_string ?(enc_change_hook = fun _ -> ()) enc s =
    let char_size = 250 in
    { ulb_encoding = enc;
      ulb_encoding_start = 0;
      ulb_rawbuf = String.copy s;
      ulb_rawbuf_len = String.length s;
      ulb_rawbuf_end = 0;
      ulb_rawbuf_const = true;
      ulb_chars = Array.make char_size (-1);
      ulb_chars_pos = ( let cp = Array.make (char_size+1) (-1) in
			cp.(0) <- 0;
			cp );
      ulb_chars_len = 0;
      ulb_eof = true;
      ulb_refill = (fun _ _ _ -> assert false);
      ulb_enc_change_hook = enc_change_hook;
      ulb_cursor = create_cursor enc "";
    }

  let from_string_inplace ?(enc_change_hook = fun _ -> ()) enc s =
    let char_size = 250 in
    { ulb_encoding = enc;
      ulb_encoding_start = 0;
      ulb_rawbuf = s;
      ulb_rawbuf_len = String.length s;
      ulb_rawbuf_end = 0;
      ulb_rawbuf_const = true;
      ulb_chars = Array.make char_size (-1);
      ulb_chars_pos = ( let cp = Array.make (char_size+1) (-1) in
			cp.(0) <- 0;
			cp );
      ulb_chars_len = 0;
      ulb_eof = true;
      ulb_refill = (fun _ _ _ -> assert false);
      ulb_enc_change_hook = enc_change_hook;
      ulb_cursor = create_cursor enc "";
    }

  let delete n ulb =
    if n < 0 || n > ulb.ulb_chars_len then
      invalid_arg "Netulex.ULB.delete";
    let m = ulb.ulb_chars_len - n in
    int_blit ulb.ulb_chars n ulb.ulb_chars 0 m;
    int_blit ulb.ulb_chars_pos n ulb.ulb_chars_pos 0 (m+1);

    if not ulb.ulb_rawbuf_const then (
      let k = ulb.ulb_chars_pos.(0) in
      assert (ulb.ulb_rawbuf_end >= k);
      let m' = ulb.ulb_rawbuf_len - k in
      String.blit ulb.ulb_rawbuf k ulb.ulb_rawbuf 0 m';
      let cp = ulb.ulb_chars_pos in
      for i = 0 to m do
	cp.(i) <- cp.(i) - k
      done;

      ulb.ulb_rawbuf_len <- m';
      ulb.ulb_rawbuf_end <- ulb.ulb_rawbuf_end - k;
    );

    ulb.ulb_chars_len  <- m;
    ulb.ulb_encoding_start <- max 0 (ulb.ulb_encoding_start - n)
      
  let set_encoding enc ulb =
    if enc <> ulb.ulb_encoding then (
      ulb.ulb_encoding <- enc;
      ulb.ulb_encoding_start <- ulb.ulb_chars_len;
      ulb.ulb_enc_change_hook ulb
    )

  let close ulb =
    ulb.ulb_eof <- true

  let utf8_sub_string k n ulb =
    if k < 0 || k > ulb.ulb_chars_len || n < 0 || k+n > ulb.ulb_chars_len then
      invalid_arg "Netulex.ULB.utf8_sub_string";

    if ulb.ulb_encoding = `Enc_utf8 && k >= ulb.ulb_encoding_start then (
      (* Extract the substring from [ulb_rawbuf] ! *)
      let k' = ulb.ulb_chars_pos.(k) in
      let n' = ulb.ulb_chars_pos.(k+n) - k' in
      String.sub ulb.ulb_rawbuf k' n'
    )
    else (
      (* Create the UTF-8 string from [ulb_chars] *)
      ustring_of_uarray `Enc_utf8 ~pos:k ~len:n ulb.ulb_chars
    )

  let utf8_sub_string_length k n ulb =
    if k < 0 || k > ulb.ulb_chars_len || n < 0 || k+n > ulb.ulb_chars_len then
      invalid_arg "Netulex.ULB.utf8_sub_string_length";

    if ulb.ulb_encoding = `Enc_utf8 && k >= ulb.ulb_encoding_start then (
      (* Extract the substring from [ulb_rawbuf] ! *)
      let k' = ulb.ulb_chars_pos.(k) in
      let n' = ulb.ulb_chars_pos.(k+n) - k' in
      n'
    )
    else (
      (* Count the UTF-8 string from [ulb_chars] *)
      (* Maybe better algorithm: divide into several slices, and call
       * ustring_of_uarray for them. Goal: Reduction of memory allocation
       *)
      let conv = ustring_of_uchar `Enc_utf8 in
      let n' = ref 0 in
      for i = k to k+n-1 do
	n' := !n' + String.length (conv ulb.ulb_chars.(i))
      done;
      !n'
    )


  let rec refill_aux ulb = 
    (* Check whether we cannot add at least one byte to [ulb_chars] because
     * of EOF:
     *)
    if ulb.ulb_eof && ulb.ulb_rawbuf_len = ulb.ulb_rawbuf_end then
      0
    else (

      (* Enlarge [ulb_chars] if necessary (need at least space for one character)
       *)
      if ulb.ulb_chars_len >= Array.length ulb.ulb_chars then (
	let n = min (Sys.max_array_length-1) (2 * (Array.length ulb.ulb_chars)) in
	if n = Array.length ulb.ulb_chars then
	  failwith "Netulex.ULB.refill: array too large";
	
	let c = Array.make n (-1) in
	let cp = Array.make (n+1) (-1) in
	int_blit ulb.ulb_chars 0 c 0 ulb.ulb_chars_len;
	int_blit ulb.ulb_chars_pos 0 cp 0 (ulb.ulb_chars_len+1);
	
	ulb.ulb_chars <- c;
	ulb.ulb_chars_pos <- cp;
      );
      
      (* If there is unanalysed material in [ulb_rawbuf], try to convert it.
       * It may happen, however, that there is only the beginning of a 
       * multi-byte character, so this may not add any new character.
       *)
      let new_chars =
	if ulb.ulb_rawbuf_end < ulb.ulb_rawbuf_len then (
	  let cs = ulb.ulb_cursor in
	  reinit_cursor
	    ~range_pos:ulb.ulb_rawbuf_end
	    ~range_len:(ulb.ulb_rawbuf_len - ulb.ulb_rawbuf_end)
	    ~enc:ulb.ulb_encoding
	    ulb.ulb_rawbuf
	    cs;
	  let counter = ref 0 in
	  ( try
	      while ulb.ulb_chars_len < Array.length ulb.ulb_chars do
		let space = Array.length ulb.ulb_chars - ulb.ulb_chars_len in
		(* cursor_blit may raise End_of_string, too *)
		let n = cursor_blit 
			  cs ulb.ulb_chars ulb.ulb_chars_len space in
		let n' = cursor_blit_positions
			   cs ulb.ulb_chars_pos ulb.ulb_chars_len space in
		assert(n=n');
		if n>0 then (
		  ulb.ulb_chars_len <- ulb.ulb_chars_len+n;
		  counter := !counter + n;
		  move ~num:n cs; (* may raise Malformed_code *)
		) 
		else (
		  (* We are at a special position in the string! *)
		  try ignore(uchar_at cs); assert false
		  with
		      Byte_order_mark ->
			(* Skip the BOM: *)
			move cs   (* may raise Malformed_code *)
			(* Note: this [move] does not count *)
		    | Partial_character ->
			(* Stop here *)
			raise Exit
		    (* End_of_string: already handled *)
		)
	      done
	    with
		Exit ->
		  ()
	      | End_of_string ->
		  ()
	  );
	  
	  let e = cursor_pos cs; in
	  ulb.ulb_chars_pos.(ulb.ulb_chars_len) <- e;
	  ulb.ulb_rawbuf_end <- e;

	  (* Encoding might have changed: *)
	  set_encoding (cursor_encoding cs) ulb;
	
	  !counter
	)
	else
	  0
      in
      
      (* In the case we still did not add any char: Check if we are near
       * EOF (the last multi-byte character is not complete).
       *)
      if new_chars = 0 then (
	if ulb.ulb_eof then raise Malformed_code;

	assert(not ulb.ulb_rawbuf_const);

	(* Now try to get new data into [ulb_rawbuf]. First, we check whether
	 * we have enough free space in this buffer. We insist on at least
	 * 50 bytes (quite arbitrary...). Then call the [ulb_refill] function
	 * to get the data.
	 *)
	if ulb.ulb_rawbuf_len + 50 >= String.length ulb.ulb_rawbuf then (
	  let n = min Sys.max_string_length (2 * (String.length ulb.ulb_rawbuf)) in
	  if n = String.length ulb.ulb_rawbuf then
	    failwith "Netulex.ULB.refill: string too large";
	  
	  let s = String.create n in
	  String.blit ulb.ulb_rawbuf 0 s 0 ulb.ulb_rawbuf_len;
	  ulb.ulb_rawbuf <- s;
	);
	
	(* Call now [ulb_refill]. If we detect EOF, record this. Anyway,
	 * start over.
	 *)
	let space = (String.length ulb.ulb_rawbuf) - ulb.ulb_rawbuf_len in
	let n = ulb.ulb_refill ulb.ulb_rawbuf ulb.ulb_rawbuf_len space in
	assert(n>=0);
	if n=0 then (
	  (* EOF *)
	  ulb.ulb_eof <- true;
	)
	else (
	  ulb.ulb_rawbuf_len <- ulb.ulb_rawbuf_len + n
	);
	
	refill_aux ulb
      )
      else
	new_chars
    )

  let refill ulb =
    let n = refill_aux ulb in
    assert(n>=0);
    if n=0 then (
      assert(ulb.ulb_eof);
      assert(ulb.ulb_rawbuf_len = ulb.ulb_rawbuf_end);
      raise End_of_file
    )

end


module Ulexing = struct
  type lexbuf =
      { ulb : ULB.unicode_lexbuf;
	mutable offset : int;
	mutable pos : int;
	mutable start : int;
	mutable marked_pos : int;
	mutable marked_val : int;
      }

  exception Error

  let from_ulb_lexbuf ulb =
    { ulb = ulb;
      offset = 0;
      pos = 0;
      start = 0;
      marked_pos = 0;
      marked_val = 0;
    }

  let lexeme_start lexbuf = lexbuf.start + lexbuf.offset
  let lexeme_end lexbuf = lexbuf.pos + lexbuf.offset
  let lexeme_length lexbuf = lexbuf.pos - lexbuf.start

  let lexeme lexbuf =
    let buf = lexbuf.ulb.ULB.ulb_chars in
    Array.sub buf lexbuf.start (lexbuf.pos - lexbuf.start)

  let sub_lexeme lexbuf pos len =
    let buf = lexbuf.ulb.ULB.ulb_chars in
    Array.sub buf (lexbuf.start + pos) len

  let lexeme_char lexbuf pos =
    let buf = lexbuf.ulb.ULB.ulb_chars in
    buf.(lexbuf.start + pos)

  let utf8_lexeme lexbuf =
    ULB.utf8_sub_string lexbuf.start (lexbuf.pos - lexbuf.start) lexbuf.ulb

  let utf8_sub_lexeme lexbuf pos len =
    ULB.utf8_sub_string (lexbuf.start + pos) len lexbuf.ulb

  let utf8_sub_lexeme_length lexbuf pos len =
    ULB.utf8_sub_string_length (lexbuf.start + pos) len lexbuf.ulb
    
  (* "Internal" interface *)
  let start lexbuf =
    lexbuf.start <- lexbuf.pos;
    lexbuf.marked_pos <- lexbuf.pos;
    lexbuf.marked_val <- (-1)

  let mark  lexbuf i =
    lexbuf.marked_pos <- lexbuf.pos;
    lexbuf.marked_val <- i
      
  let backtrack lexbuf =
    lexbuf.pos <- lexbuf.marked_pos;
    lexbuf.marked_val

  let rollback lexbuf =
    lexbuf.pos <- lexbuf.start

  let eof = (-1)

  let refill lexbuf = 
    try
      (* Delete all characters in ulexbuf before the current lexeme: *)
      if lexbuf.start > 0 then (
	let n = lexbuf.start in
	ULB.delete n lexbuf.ulb;
	lexbuf.offset <- lexbuf.offset + n;
	lexbuf.pos <- lexbuf.pos - n;
	lexbuf.marked_pos <- lexbuf.marked_pos - n;
	lexbuf.start <- 0;
      );
      ULB.refill lexbuf.ulb;
      (* raises either End_of_file, or ensures there is one char in ulb *)
      lexbuf.ulb.ULB.ulb_chars.(lexbuf.pos)
    with
	End_of_file ->
	  (* We cannot modify the buffer as the original Ulexing implementation
	   *)
	  eof
	  

  let next lexbuf =
    let ulb = lexbuf.ulb in
    let i =
      if lexbuf.pos = ulb.ULB.ulb_chars_len then
	refill lexbuf
      else 
	ulb.ULB.ulb_chars.(lexbuf.pos)
    in
    if i <> eof then lexbuf.pos <- lexbuf.pos + 1;
    i

end

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml