(* $Id: xstr_match.mli,v 1.2 1999/07/04 20:02:08 gerd Exp $
* ----------------------------------------------------------------------
* Matching strings
*)
(* Copyright 1999 by Gerd Stolpmann *)
type variable
(* A 'variable' records the region of the input string that was matched
* by the 'Record' matcher it is attached to.
* The type is abstract: variables are created with 'var' and inspected
* with 'var_matched', 'string_of_var' and 'found_string_of_var'.
*)
type charset
(* A set of characters.
* The type is abstract: sets are built with 'mkset' and 'mknegset' and
* are consumed by the matchers Anystring_from, Lazystring_from and
* Anychar_from.
*)
type matcher =
Literal of string
| Anystring
| Lazystring
| Anychar
| Anystring_from of charset
| Lazystring_from of charset
| Anychar_from of charset
| Nullstring
| Alternative of matcher list list
| Optional of matcher list
| Record of (variable * matcher list)
| Scanner of (string -> int)
;;
(* A 'matcher' is one element of a pattern; a pattern is a 'matcher list'.
*
* Literal s: matches literally s and nothing else
* Anystring/
* Lazystring: matches a string of arbitrary length with arbitrary
*   contents
* Anystring_from s/
* Lazystring_from s: matches a string of arbitrary length with characters
*   from charset s
* Anychar: matches an arbitrary character
* Anychar_from s: matches a character from charset s
* Nullstring: matches the empty string
* Alternative
*   [ ml1; ml2; ... ]: first tries the sequence ml1, then ml2, and so on
*   until one of the sequences leads to a match of the whole string
* Optional ml: first tries the sequence ml, then the empty string.
*   = Alternative [ml; [Nullstring]]
* Record (v, ml): matches the same as ml, but the region of the string
*   that was matched is recorded in the variable v
* Scanner f: f s is called where s is the rest of the string to match.
*   The function should return the number of characters it can match,
*   or raise Not_found
*
* The "Any..." and "Lazy..." string matchers accept the same strings; they
* differ only in the order in which candidate lengths are tried (see the
* notes on 'match_string').
*)
val match_string : matcher list -> string -> bool
(* match_string ml s:
* Tries to match the pattern 'ml' against the string 's'; returns true on
* success, and false otherwise.
* As a side effect, the variables occurring in 'ml' are set.
* Matching proceeds from left to right, and for some of the matchers there
* are particular matching orders. The first match that is found using
* this order is returned (i.e. the variables get their values from this
* match).
* Notes:
* - Anystring and Anystring_from are "greedy"; they try to match as much
*   as possible.
* - In contrast to this, Lazystring and Lazystring_from are "lazy"; they
*   try to match as little as possible.
* - Alternatives are tested from left to right.
* - Options are first tested with their argument, then with the empty
*   string (i.e. they are "greedy")
*)
type replacer =
ReplaceLiteral of string
| ReplaceVar of variable
| ReplaceFunction of (unit -> string)
;;
(* A 'replacer' describes one piece of the text that is substituted for a
* matching by 'replace_matched_substrings'; the pieces of a
* 'replacer list' are used in sequence.
*
* ReplaceLiteral t: the string t
* ReplaceVar v: the contents of the variable v, or the empty string
*   if v has no matching
* ReplaceFunction f: the result of f(). The function may raise Not_found
*   or Match_failure to skip to the next matching.
*)
type rflag =
Anchored
| Limit of int
(* | RightToLeft *)
;;
(* An 'rflag' modifies the behaviour of 'replace_matched_substrings'.
*
* Anchored: not the substrings of the input string, but only the input
*   string itself is matched against the pattern
* Limit n: at most n replacements will be done
*
* The constructor RightToLeft is commented out because it is not yet
* implemented.
*)
val replace_matched_substrings : matcher list -> replacer list -> rflag list
-> string -> (string * int)
(* replace_matched_substrings ml rl fl s:
*
* All substrings of 's' are matched against 'ml' in turn, and all
* non-overlapping matchings are replaced according to 'rl'. The standard
* behaviour is to test from left to right, and to replace all occurrences
* of substrings.
* This can be modified by 'fl':
* - Anchored: Not the substrings of 's', but only 's' itself is
*   matched against 'ml'.
* - Limit n: At most 'n' replacements will be done.
* - RightToLeft: Begin with the rightmost matching; proceed with more
*   left matchings (NOT YET IMPLEMENTED!!!!)
* The meaning of 'rl': Every matching is replaced by the sequence of
* the elements of 'rl'.
* - ReplaceLiteral t: Replace with the string t
* - ReplaceVar v: Replace with the contents of 'v' or the empty string,
*   if v has no matching
* - ReplaceFunction f: Replace with f(). You may raise Not_found or
*   Match_failure to skip to the next matching.
* 'replace_matched_substrings' returns a pair (s', n) where s' is the
* string after the replacements and n is the number of replacements
* that were done.
*)
val var : string -> variable
(* var s: creates a new variable with initial value s. If this variable
* is used in a subsequent matching, and a value is found, the value
* is overwritten; otherwise the old value persists.
* - Initial values are stored as references to strings
* - Matched values are stored as triples (s,from,len) where 's' is the
*   input string of the matching function
*
* [Note on thread-safety: variables must not be shared by multiple
* threads.]
*)
val var_matched : variable -> bool
(* var_matched v: returns true if the variable 'v' matched a value in the
* last match_string
*)
val string_of_var : variable -> string
(* string_of_var v: returns the current value of the variable 'v', i.e.
* the value found by a matching if one has overwritten it, and otherwise
* the initial value that was passed to 'var'
*)
val found_string_of_var : variable -> string
(* found_string_of_var v: returns the current value of the variable 'v'
* only if there was a match for this variable in the last match_string;
* otherwise raises Not_found
*)
val mkset : string -> charset
(* mkset s: creates a set from the readable description 's'. The string
* simply enumerates the characters of the set, and the notation "x-y" is
* possible, too (e.g. mkset "0-9").
* To include '-' in the set, put it at the beginning or end of 's'.
*)
val mknegset : string -> charset
(* mknegset s: creates the complement of the set that 'mkset s' would
* create; the description 's' is written in the same notation
*)
(* ---------------------------------------------------------------------- *)
(* EXAMPLE:
*
* let v = var "" in
* let _ = match_string [ Literal "("; Record (v, [Anystring]); Literal ")" ]
* s
* in found_string_of_var v
*
* - if s is "(abc)" returns "abc"
* - if the parentheses are missing, raises Not_found
*
* VARIANT I:
*
* let v = var "" in
* let _ = match_string [ Lazystring;
* Literal "("; Record (v, [Lazystring]); Literal ")";
* Anystring ]
* s
* in found_string_of_var v
*
* - finds the first substring enclosed in parentheses, e.g.
* s = "abc(def)ghi(jkl)mno" returns "def"
*
* To get the last such substring instead, swap the Lazystring at the
* beginning with the Anystring at the end.
*
* VARIANT II:
*
* let v = var "" in
* let _ = match_string [ Lazystring;
* Literal "("; Record (v, [Anystring]); Literal ")";
* Anystring ]
* s
* in found_string_of_var v
*
* - for s = "abc(def)ghi(jkl)mno" this returns "def)ghi(jkl", because the
*   recorded Anystring is greedy
*)
(* ---------------------------------------------------------------------- *)
(* EXAMPLE:
*
* let v = var "" in
* let digits = mkset "0-9" in
* let digits_re = [ Record(v, [ Anychar_from digits; Anystring_from digits])]
* in
* replace_matched_substrings digits_re [ ReplaceLiteral "D" ] [] "ab012cd456fg"
*
* yields: ("abDcdDfg", 2)
*
* VARIANT I:
*
* replace_matched_substrings digits_re [ ReplaceLiteral "D" ]
* [ Limit 1 ] "ab012cd456fg"
*
* yields: ("abDcd456fg", 1)
*
* VARIANT II:
*
* replace_matched_substrings digits_re [ ReplaceLiteral "D" ]
* [ Anchored ] "ab012cd456fg"
*
* yields: ("ab012cd456fg", 0)
*
* VARIANT III:
*
* replace_matched_substrings digits_re [ ReplaceLiteral "D" ]
* [ Anchored ] "012"
*
* yields: ("D", 1)
*
* VARIANT IV:
*
* let f() = string_of_int(1+int_of_string(string_of_var v)) in
* replace_matched_substrings digits_re [ ReplaceFunction f ]
* [] "ab012cd456fg"
*
* yields: ("ab13cd457fg", 2)
*)
(* ======================================================================
* History:
*
* $Log: xstr_match.mli,v $
* Revision 1.2 1999/07/04 20:02:08 gerd
* Added Lazystring, Lazystring_from.
* Added replace_matched_substring function.
* Changed the structure of 'variable'. 'sref' is either an arbitrary
* string, or it is the input string of the matching function. 'from' and
* 'len' are always used.
*
* Revision 1.1 1999/06/27 23:03:38 gerd
* Initial revision.
*
*
*)