Plasma GitLab Archive
Projects Blog Knowledge

(* $Id$
 * ----------------------------------------------------------------------
 * PXP: The polymorphic XML parser for Objective Caml.
 * Copyright by Gerd Stolpmann. See LICENSE for details.
 *)

(** Support module for Alain Frisch's [ulex] lexer generator
 *
 * The sub module [ULB] is a Unicode-based lexing buffer that
 * reads encoded strings and makes them available to the lexer
 * as both Unicode arrays and UTF-8 strings.
 *
 * The sub module [Ulexing] is a replacement for the module
 * in [ulex] with the same name. It uses [ULB] to represent
 * the main lexing buffer. It is much faster than the original
 * [Ulexing] implementation when the scanned text is UTF-8
 * encoded and [Ulexing.utf8_lexeme] is frequently called to
 * get the lexeme strings. Furthermore, it can process input
 * data of all encodings available to [Netconversion]. It is,
 * however, no drop-in replacement as it has a different
 * signature.
 *
 * To enable this version of [Ulexing], simply put an
 * [open Netulex] before using the [ulex] lexers.
 *
 * Note that the tutorial has been moved to {!Netulex_tut}.
 *)

(** {1:modules Modules} *)


module ULB : sig
  (** This module provides the [unicode_lexbuf] record with
   * access functions. In this record, the data is available
   * in two forms: As an array of Unicode code points
   * [ulb_chars], and as string of encoded chars [ulb_rawbuf].
   * Both buffers are synchronised by [ulb_chars_pos]. This
   * array stores where every character of [ulb_chars] can be
   * found in [ulb_rawbuf].
   *)

  type unicode_lexbuf =
      private
	{ mutable ulb_encoding : Netconversion.encoding;
                (** The character encoding of [ulb_rawbuf] *)
	  mutable ulb_encoding_start : int;
	        (** The first character position to which [ulb_encoding]
		 * applies (the encoding of earlier positions is
		 * lost)
		 *)
	  mutable ulb_rawbuf : Bytes.t;
                (** The encoded string to analyse *)
	  mutable ulb_rawbuf_len : int;
                (** The filled part of [ulb_rawbuf] *)
	  mutable ulb_rawbuf_end : int;
                (** The analysed part of [ulb_rawbuf]. We have always
	         * [ulb_rawbuf_end <= ulb_rawbuf_len]. The analysed part
	         * may be shorter than the filled part because there is
	         * not enough space in [ulb_chars], or because the filled
	         * part ends with an incomplete multi-byte character
	         *)
	  mutable ulb_rawbuf_const : bool;
	        (** Whether [ulb_rawbuf] is considered as a constant. If
		 * [true], it is never blitted.
		 *)
	  mutable ulb_chars : int array;
                (** The analysed part of [ulb_rawbuf] as array of Unicode
	         * code points. Only the positions 0 to [ulb_chars_len-1]
	         * of the array are filled.
	         *)
	  mutable ulb_chars_pos : int array;
                (** For every analysed character this array stores the
   	         * byte position where the character begins in [ulb_rawbuf].
	         * In addition, the array contains at [ulb_chars_len] the
	         * value of [ulb_rawbuf_end].
		 *
	         * This array is one element longer than [ulb_chars].
	         *)
	  mutable ulb_chars_len : int;
                (** The filled part of [ulb_chars] *)
	  mutable ulb_eof : bool;
                (** Whether EOF has been seen *)
	  mutable ulb_refill : Bytes.t -> int -> int -> int;
	        (** The refill function *)
	  mutable ulb_enc_change_hook : unicode_lexbuf -> unit;
	        (** This function is called when the encoding changes *)
	  mutable ulb_cursor : Bytes.t Netconversion.poly_cursor;
	        (** Internally used by the implementation *)
	}

  val from_function : 
    ?raw_size:int -> 
    ?char_size:int -> 
    ?enc_change_hook:(unicode_lexbuf -> unit) ->
    refill:(Bytes.t -> int -> int -> int) ->
    Netconversion.encoding -> 
      unicode_lexbuf
  (** Creates a [unicode_lexbuf] to analyse strings of the 
   * passed [encoding] coming from the [refill] function.
   *
   * @param raw_size The initial size for [ulb_rawbuf]. Defaults to 512
   * @param char_size The initial size for [ulb_chars]. Defaults to 256
   * @param enc_change_hook This function is called when the encoding
   *   is changed, either by this module, or by the user calling
   *   [set_encoding].
   * @param refill This function is called with arguments [ulb_rawbuf],
   *   [ulb_rawbuf_len], and [l], where 
   *   [l = String.length ulb_rawbuf - ulb_rawbuf_len] is the free
   *   space in the buffer. The function should fill new bytes into
   *   this substring, and return the number of added bytes. The
   *   return value 0 signals EOF.
   *)

  val from_in_obj_channel :
    ?raw_size:int -> 
    ?char_size:int -> 
    ?enc_change_hook:(unicode_lexbuf -> unit) ->
    Netconversion.encoding -> 
    Netchannels.in_obj_channel ->
      unicode_lexbuf
  (** Creates a [unicode_lexbuf] to analyse strings of the 
   * passed [encoding] coming from the object channel.
   *
   * @param raw_size The initial size for [ulb_rawbuf]. Defaults to 512
   * @param char_size The initial size for [ulb_chars]. Defaults to 256
   * @param enc_change_hook This function is called when the encoding
   *   is changed, either by this module, or by the user calling
   *   [set_encoding].
   *)

  val from_string :
        ?enc_change_hook:(unicode_lexbuf -> unit) ->
        Netconversion.encoding -> string -> unicode_lexbuf
  (** Creates a [unicode_lexbuf] analysing the passed string encoded in
   * the passed encoding. This function copies the input string.
   *
   * @param enc_change_hook This function is called when the encoding
   *   is changed, either by this module, or by the user calling
   *   [set_encoding]
   *)

  val from_bytes :
        ?enc_change_hook:(unicode_lexbuf -> unit) ->
        Netconversion.encoding -> Bytes.t -> unicode_lexbuf
  (** Same for bytes *)

  val from_bytes_inplace :
        ?enc_change_hook:(unicode_lexbuf -> unit) ->
        Netconversion.encoding -> Bytes.t -> unicode_lexbuf
  (** Creates a [unicode_lexbuf] analysing the passed string encoded in
   * the passed encoding. This function does not copy the input string,
   * but uses it directly as [ulb_rawbuf]. The string is not modified by [ULB],
   * but the caller must ensure that other program parts do not
   * modify it either.
   *
   * @param enc_change_hook This function is called when the encoding
   *   is changed, either by this module, or by the user calling
   *   [set_encoding]
   *)

  (** Regarding [from_string_inplace], this function has been removed
      as strings are now considered immutable.
   *)

  val delete :
        int -> unicode_lexbuf -> unit
  (** Deletes the number of characters from [unicode_lexbuf].
   *  These characters
   *  are removed from the beginning of the buffer, i.e.
   *  [ulb_chars.(n)] becomes the new first character of the
   *  buffer. All three buffers [ulb_rawbuf], [ulb_chars], and
   *  [ulb_chars_pos] are blitted as necessary.       
   *
   *  When the buffer is already at EOF, the function fails.
   *
   *  For efficiency, it should be tried to call [delete] as seldom as
   *  possible. Its speed is linear to the number of characters to move.
   *)

  val refill :
	unicode_lexbuf ->
	  unit
  (** Tries to add characters to the [unicode_lexbuf] by calling the 
   * [ulb_refill] function. When the buffer is already at EOF, the 
   * exception [End_of_file] is raised, and the buffer is not modified.
   * Otherwise, the [ulb_refill] function is called to
   * add new characters. If necessary, [ulb_rawbuf], [ulb_chars], and
   * [ulb_chars_pos] are enlarged such that it is ensured that either
   * at least one new character is added, or that EOF is found for
   * the first time
   * In the latter case, [ulb_eof] is set to [true] (and the next call
   * of [refill_unicode_lexbuf] will raise [End_of_file]).
   *)

  val set_encoding :
        Netconversion.encoding -> unicode_lexbuf -> unit
   (** Sets the [encoding] to the passed value. This only affects future
    * [refill] calls. The hook [enc_change_hook] is invoked when defined.
    *)

  val close :
        unicode_lexbuf -> unit
  (** Sets [ulb_eof] of the [unicode_lexbuf]. The rest of the buffer
   * is not modified
   *)

  val utf8_sub_string : int -> int -> unicode_lexbuf -> string
    (** The two [int] arguments are the position and length of a sub
     * string of the lexbuf that is returned as UTF8 string. Position
     * and length are given as character multiples, not byte multiples.
     *)

  val utf8_sub_string_length : int -> int -> unicode_lexbuf -> int
    (** Returns [String.length(utf8_sub_string args)]. Tries not to
     * allocate the UTF-8 string.
     *)

end (* module ULB *)


module Ulexing : sig
  (** This is a lexing buffer for [ulex]. *)

  type lexbuf
  exception Error
    (** Lexical error *)

  val from_ulb_lexbuf : ULB.unicode_lexbuf -> lexbuf
    (** Creates a new [lexbuf] from the [unicode_lexbuf]. After that,
     * the [unicode_lexbuf] must no longer be modified.
     *)
  val lexeme_start: lexbuf -> int
    (** The character position of the start of the lexeme *)
  val lexeme_end: lexbuf -> int
    (** The character position of the end of the lexeme *)
  val lexeme_length: lexbuf -> int
    (** The length of the lexeme in characters *)
  val lexeme: lexbuf -> int array
    (** Returns the lexeme as array of Unicode code points *)
  val lexeme_char: lexbuf -> int -> int
    (** Returns the code point of a certain character of the
     * lexeme
     *)
  val sub_lexeme: lexbuf -> int -> int -> int array
    (** Returns a substring of the lexeme as array of Unicode
     * code points. The first [int] is the characater position
     * where to start, the second [int] is the number of 
     * characters.
     *)
  val utf8_lexeme: lexbuf -> string
    (** Returns the lexeme as UTF-8 encoded string *)
  val utf8_sub_lexeme: lexbuf -> int -> int -> string
    (** Returns a substring of the lexeme as UTF-8 encoded
     * string. The first [int] is the characater position
     * where to start, the second [int] is the number of
     * characters.
     *)              
  val utf8_sub_lexeme_length: lexbuf -> int -> int -> int
    (** Same as
     * String.length(utf8_sub_lexeme args), i.e. returns
     * the number of bytes a certain sub lexeme will have
     * when encoded as UTF-8 string.
     *)
    
  (**/**)

  (* "Internal" interface. This must match ulex's ones. *)
  val start: lexbuf -> unit
  val next: lexbuf -> int
  val mark: lexbuf -> int -> unit
  val backtrack: lexbuf -> int
end



This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml