Plasma GitLab Archive
Projects Blog Knowledge

(* $Id: xstr_match.mli,v 1.2 1999/07/04 20:02:08 gerd Exp $
 * ----------------------------------------------------------------------
 * Matching strings
 *)

(* Copyright 1999 by Gerd Stolpmann *)

type variable
   (* A 'variable' records the region of the input string that was matched
    * by the matchers enclosed in a 'Record'. Variables are created with
    * 'var' and read with 'string_of_var' or 'found_string_of_var'.
    *)

type charset
   (* A set of characters; created with 'mkset' or 'mknegset' *)

type matcher =
    Literal of string
  | Anystring 
  | Lazystring 
  | Anychar
  | Anystring_from of charset
  | Lazystring_from of charset
  | Anychar_from of charset
  | Nullstring
  | Alternative of matcher list list
  | Optional of matcher list
  | Record of (variable * matcher list)
  | Scanner of (string -> int)
;;

(* A 'matcher' is one element of a pattern; a pattern is a 'matcher list'
 * whose elements are matched one after another, from left to right.
 *
 * Literal s:            matches literally s and nothing else
 * Anystring/Lazystring: match a string of arbitrary length with arbitrary
 *                       contents (Anystring is greedy, Lazystring is lazy)
 * Anystring_from s/
 * Lazystring_from s:    match a string of arbitrary length with characters
 *                       from charset s (greedy and lazy, respectively)
 * Anychar:              matches an arbitrary character
 * Anychar_from s:       matches a character from charset s
 * Nullstring:           matches the empty string
 * Alternative
 *   [ ml1; ml2; ... ]:  first tries the sequence ml1, then ml2, and so on
 *                       until one of the sequences leads to a match of the
 *                       whole string
 * Optional ml:          first tries the sequence ml, then the empty string.
 *                       = Alternative [ml; [Nullstring]]
 * Record (v, ml):       matches the same as ml, but the region of the string
 *                       is recorded in v
 * Scanner f:            f s is called where s is the rest to match. The
 *                       function should return the number of characters it
 *                       can match, or raise Not_found
 *)


val match_string : matcher list -> string -> bool

  (* match_string ml s:
   * Tries to match 'ml' against the string 's'; returns true on success, and
   * false otherwise.
   * As a side effect, the variables in 'ml' are set.
   * Matching proceeds from left to right, and for some of the matchers there
   * are particular matching orders. The first match that is found using
   * this order is returned (i.e. the variables get their values from this
   * match).
   * Notes:
   * - Anystring and Anystring_from are "greedy"; they try to match as much
   *   as possible.
   * - In contrast to this, Lazystring and Lazystring_from are "lazy"; they
   *   try to match as little as possible.
   * - Alternatives are tested from left to right.
   * - Options are first tested with argument, then with the empty string
   *   (i.e. "greedy")
   *)

(* A 'replacer' is one element of the replacement text used by
 * replace_matched_substrings; every matching is replaced by the sequence
 * of the elements of the replacer list.
 *)
type replacer =
    ReplaceLiteral of string          (* insert the given string *)
  | ReplaceVar of variable            (* insert the contents of the variable,
                                       * or the empty string if the variable
                                       * has no matching *)
  | ReplaceFunction of (unit -> string)
                                      (* insert the result of calling the
                                       * function with () *)
;;


(* Flags that modify the behaviour of replace_matched_substrings *)
type rflag =
    Anchored       (* match only the whole string, not its substrings *)
  | Limit of int   (* do at most this many replacements *)
  (* | RightToLeft *)   (* not yet implemented, hence commented out *)
;;

val replace_matched_substrings : matcher list -> replacer list -> rflag list
                                  -> string -> (string * int)

  (* replace_matched_substrings ml rl fl s:
   *
   * All substrings of 's' are matched against 'ml' in turn, and all
   * non-overlapping matchings are replaced according to 'rl'. The standard
   * behaviour is to test from left to right, and to replace all occurrences
   * of substrings.
   * This can be modified by 'fl':
   *   - Anchored:  Not the substrings of 's', but only 's' itself is 
   *                matched against 'ml'. 
   *   - Limit n:   At most 'n' replacements will be done.
   *   - RightToLeft:  Begin with the rightmost matching; proceed with more
   *                   left matchings (NOT YET IMPLEMENTED!!!!)
   * The meaning of 'rl': Every matching is replaced by the sequence of
   * the elements of 'rl'.
   *   - ReplaceLiteral t:  Replace by the string t
   *   - ReplaceVar v:      Replace by the contents of 'v' or by the empty
   *                        string, if v has no matching
   *   - ReplaceFunction f: Replace by f(). You may raise Not_found or
   *        Match_failure to skip to the next matching.
   * 'replace_matched_substrings' returns a pair (s', n) where s' is the
   * string after the replacements have been done, and n is the number of
   * replacements.
   *)


val var : string -> variable

  (* var s: creates a new variable with initial value s. If this variable
   * is used in a subsequent matching, and a value is found, the value
   * is overwritten; otherwise the old value persists.
   * - Initial values are stored as references to strings
   * - Matched values are stored as triples (s,from,len) where 's' is the
   *   input string of the matching function
   *
   * [Note on thread-safety: variables must not be shared by multiple
   * threads.]
   *)

val var_matched  : variable -> bool

  (* var_matched v: returns true if the variable 'v' matched a value in the
   * last match_string
   *)

val string_of_var : variable -> string

  (* string_of_var v: returns the current value of the variable 'v'. As
   * described for 'var', this is the initial value as long as no matching
   * has overwritten it.
   *)

val found_string_of_var : variable -> string

  (* found_string_of_var v: returns the current value of the variable 'v'
   * only if there was a match for this variable in the last match_string;
   * otherwise raises Not_found
   *)

val mkset : string -> charset

  (* mkset s: creates a set from a readable description. The string simply
   * enumerates the characters of the set, and the notation "x-y" is
   * possible, too (e.g. mkset "0-9").
   * To include '-' in the set, put it at the beginning or end.
   *)

val mknegset : string -> charset

  (* mknegset s: creates the complement of the set that mkset would create
   * for the same description 's'
   *)


(* ---------------------------------------------------------------------- *)

(* EXAMPLE:
 *
 * let v = var "" in
 * let _ = match_string [ Literal "("; Record (v, [Anystring]); Literal ")" ]
 *                      s 
 * in found_string_of_var v
 *
 * - if s is "(abc)" returns "abc"
 * - if the parentheses are missing, raises Not_found
 *
 * VARIANT I:
 *
 * let v = var "" in
 * let _ = match_string [ Lazystring;
 *                        Literal "("; Record (v, [Lazystring]); Literal ")";
 *                        Anystring ]
 *                      s 
 * in found_string_of_var v
 *
 * - finds the first substring enclosed in parentheses, e.g.
 *   s = "abc(def)ghi(jkl)mno" returns "def"
 *
 * To get the last such substring instead, exchange the Lazystring at the
 * beginning and the Anystring at the end of the matcher list.
 *
 * VARIANT II:
 *
 * let v = var "" in
 * let _ = match_string [ Lazystring;
 *                        Literal "("; Record (v, [Anystring]); Literal ")";
 *                        Anystring ]
 *                      s 
 * in found_string_of_var v
 *
 * - for s = "abc(def)ghi(jkl)mno" this returns "def)ghi(jkl", because the
 *   recorded Anystring is greedy
 *)

(* ---------------------------------------------------------------------- *)

(* EXAMPLE:
 *
 * let v = var "" in
 * let digits = mkset "0-9" in
 * let digits_re = [ Record(v, [ Anychar_from digits;  Anystring_from digits])]
 * in
 * replace_matched_substrings digits_re [ ReplaceLiteral "D" ] [] "ab012cd456fg"
 *
 * yields: ("abDcdDfg", 2)
 *
 * VARIANT I: 
 *
 * replace_matched_substrings digits_re [ ReplaceLiteral "D" ] 
 *                                      [ Limit 1 ] "ab012cd456fg"
 *
 * yields: ("abDcd456fg", 1)
 * 
 * VARIANT II:
 * 
 * replace_matched_substrings digits_re [ ReplaceLiteral "D" ] 
 *                                      [ Anchored ] "ab012cd456fg"
 *
 * yields: ("ab012cd456fg", 0)
 *
 * VARIANT III:
 * 
 * replace_matched_substrings digits_re [ ReplaceLiteral "D" ] 
 *                                      [ Anchored ] "012"
 *
 * yields: ("D", 1)
 *
 * VARIANT IV:
 * 
 * let f() = string_of_int(1+int_of_string(string_of_var v)) in
 * replace_matched_substrings digits_re [ ReplaceFunction f ] 
 *                                      [] "ab012cd456fg"
 *
 * yields: ("ab13cd457fg", 2)
 *)


(* ======================================================================
 * History:
 * 
 * $Log: xstr_match.mli,v $
 * Revision 1.2  1999/07/04 20:02:08  gerd
 * 	Added Lazystring, Lazystring_from.
 * 	Added replace_matched_substring function.
 * 	Changed the structure of 'variable'. 'sref' is either an arbitrary
 * string, or it is the input string of the matching function. 'from' and
 * 'len' are always used.
 *
 * Revision 1.1  1999/06/27 23:03:38  gerd
 * 	Initial revision.
 *
 * 
 *)

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml