(* $Id: xstr_match.ml,v 1.9 2002/07/07 11:27:16 gerd Exp $
* ----------------------------------------------------------------------
* String matching
*)
type variable =
{ mutable sref : string;
mutable found : bool;
mutable begun : bool;
mutable from : int;
mutable len : int
}
;;
type charset = int array;;
type matcher =
Literal of string
| Anystring
| Lazystring
| Anychar
| Anystring_from of charset
| Lazystring_from of charset
| Anychar_from of charset
| Nullstring
| Alternative of matcher list list
| Optional of matcher list
| Record of (variable * matcher list)
| Scanner of (string -> int)
;;
(**********************************************************************)
(* operations on sets *)
(* copied from the JavaCaml regexp implementation *)
let the_full_set = Array.create 16 0xffff;;
let the_empty_set = Array.create 16 0;;
let dup_set s =
Array.copy s
;;
let empty_set () =
the_empty_set
;;
let full_set () =
the_full_set
;;
let ( +! ) a b =
(* union *)
let r = Array.create 16 0 in
for i=0 to 15 do
r.(i) <- a.(i) lor b.(i)
done;
r
;;
let ( *! ) a b =
(* intersection *)
let r = Array.create 16 0 in
for i=0 to 15 do
r.(i) <- a.(i) land b.(i)
done;
r
;;
let ( !! ) a =
(* negation *)
let r = Array.create 16 0 in
for i=0 to 15 do
r.(i) <- a.(i) lxor 0xffff
done;
r
;;
let ( ?! ) a =
(* not null? *)
let n = ref 0 in
for i=0 to 15 do
n := !n lor a.(i)
done;
!n <> 0
;;
let set_include a n =
(* include in set -- this is in-place modification! *)
a.( n lsr 4 ) <- a.( n lsr 4 ) lor (1 lsl (n land 15))
;;
let set_exclude a n =
(* exclude from set -- this is in-place modification! *)
a.( n lsr 4 ) <- a.( n lsr 4 ) land ((1 lsl (n land 15)) lxor 0xffff)
;;
let member_of_set n a =
(* (a.( n lsr 4 ) land (1 lsl (n land 15))) <> 0 *)
(a.( n lsr 4 ) lsr (n land 15)) land 1 <> 0
;;
let word_set() =
let a = dup_set (empty_set()) in
List.iter
(fun c ->
set_include a (Char.code c))
[ 'a'; 'b'; 'c'; 'd'; 'e'; 'f'; 'g'; 'h'; 'i'; 'j'; 'k'; 'l'; 'm';
'n'; 'o'; 'p'; 'q'; 'r'; 's'; 't'; 'u'; 'v'; 'w'; 'x'; 'y'; 'z';
'A'; 'B'; 'C'; 'D'; 'E'; 'F'; 'G'; 'H'; 'I'; 'J'; 'K'; 'L'; 'M';
'N'; 'O'; 'P'; 'Q'; 'R'; 'S'; 'T'; 'U'; 'V'; 'W'; 'X'; 'Y'; 'Z';
'0'; '1'; '2'; '3'; '4'; '5'; '6'; '7'; '8'; '9';
'_' ];
a
;;
let noword_set() =
let a = word_set() in
!! a
;;
let set_as_string set =
let s = String.make 32 ' ' in
for i = 0 to 15 do
s.[i+i] <- Char.chr (set.(i) land 0xff);
s.[i+i+1] <- Char.chr (set.(i) lsr 8);
done;
s
;;
(**********************************************************************)
exception Found of variable list
;;
let rec first_character ml =
(* return a set of characters s: all non-empty strings that ml matches have
* initial characters that are element of s; if ml matches the empty string
* then full_set() is returned.
*)
match ml with
[] -> full_set()
| Literal "" :: ml' -> first_character ml'
| Literal s :: _ ->
let cs = dup_set(empty_set()) in
set_include cs (Char.code s.[0]);
cs
| Anystring :: _ -> full_set()
| Lazystring :: _ -> full_set()
| Anychar :: _ -> full_set()
| Anystring_from s :: _ -> full_set()
| Lazystring_from s :: _ -> full_set()
| Anychar_from s :: _ -> s
| Nullstring :: ml' -> first_character ml'
| Alternative l :: _ ->
List.fold_left
(fun s x -> s +! (first_character x))
(empty_set())
l
| Optional ml1 :: ml2 -> (first_character ml1) +! (first_character ml2)
| Record (v,ml1) :: ml2 -> first_character ml1
| Scanner f :: _ -> full_set()
;;
let match_string_at ml s k =
let len = String.length s in
let rec run k ml recs =
(* returns () meaning that nothing has been found, or
* Found recs'.
* 'k': position in s
* 'ml': matcher list to process
* 'recs': recorded sections up to now
* 'Some recs'': total list of recorded sections
*)
match ml with
[] ->
if k = len then
raise(Found recs)
| Literal x :: ml' ->
let xlen = String.length x in
begin match xlen with
0 -> run k ml' recs
| 1 -> if k+1 <= len && s.[k] = x.[0] then run (k+1) ml' recs
| 2 -> if k+2 <= len && s.[k] = x.[0] && s.[k+1] = x.[1] then
run (k+2) ml' recs
| 3 -> if k+3 <= len && s.[k] = x.[0] && s.[k+1] = x.[1] &&
s.[k+2] = x.[2] then
run (k+3) ml' recs
| 4 -> if k+4 <= len && s.[k] = x.[0] && s.[k+1] = x.[1] &&
s.[k+2] = x.[2] && s.[k+3] = x.[3] then
run (k+4) ml' recs
| _ -> if k + xlen <= len && String.sub s k xlen = x then
run (k+xlen) ml' recs
(* this is still not optimal *)
end
| Anystring :: ml' ->
run len ml' recs;
let ml'fc = first_character ml' in
let rec find n =
if n >= 0 then begin
if member_of_set (Char.code s.[k+n]) ml'fc then
run (k+n) ml' recs;
find (n-1)
end
in
find (len-k-1)
| Lazystring :: ml' ->
let ml'fc = first_character ml' in
let max = len-k in
let max3 = max - 3 in
let rec find n =
if n < max3 then begin
let c1 = Char.code s.[k+n] in
if member_of_set c1 ml'fc then
run (k+n) ml' recs;
let c2 = Char.code s.[k+n+1] in
if member_of_set c2 ml'fc then
run (k+n+1) ml' recs;
let c3 = Char.code s.[k+n+2] in
if member_of_set c3 ml'fc then
run (k+n+2) ml' recs;
let c4 = Char.code s.[k+n+3] in
if member_of_set c4 ml'fc then
run (k+n+3) ml' recs;
find (n+4)
end
else if n <= max then begin
run (k+n) ml' recs;
find (n+1)
end
in
find 0
| Anystring_from set :: ml' ->
let rec region n =
if k+n < len then
let c = Char.code (s.[k+n]) in
if member_of_set c set then
region (n+1)
else
n
else
n
in
let max = region 0 in
run (k+max) ml' recs;
let ml'fc = first_character ml' in
let rec find n =
if n >= 3 then begin
if member_of_set (Char.code s.[k+n]) ml'fc then
run (k+n) ml' recs;
if member_of_set (Char.code s.[k+n-1]) ml'fc then
run (k+n-1) ml' recs;
if member_of_set (Char.code s.[k+n-2]) ml'fc then
run (k+n-2) ml' recs;
if member_of_set (Char.code s.[k+n-3]) ml'fc then
run (k+n-3) ml' recs;
find (n-4)
end
else if n >= 0 then begin
if member_of_set (Char.code s.[k+n]) ml'fc then
run (k+n) ml' recs;
find (n-1)
end
in
find (max-1)
| Lazystring_from set :: ml' ->
let rec region n =
if k+n < len then
let c = Char.code (s.[k+n]) in
if member_of_set c set then
region (n+1)
else
n
else
n
in
let max = region 0 in
let ml'fc = first_character ml' in
let rec find n =
if n < max then begin
if member_of_set (Char.code s.[k+n]) ml'fc then
run (k+n) ml' recs;
find (n+1)
end
else if n = max then
run (k+max) ml' recs
in
find 0
| Anychar :: ml' ->
if k < len then
run (k+1) ml' recs
| Anychar_from set :: ml' ->
if k < len then
let c = Char.code (s.[k]) in
if member_of_set c set then
run (k+1) ml' recs
| Nullstring :: ml' ->
run k ml' recs
| Alternative alts :: ml' ->
let rec find alts =
match alts with
[] -> ()
| alt :: alts' ->
run k (alt @ ml') recs;
find alts'
in
find alts
| Optional opt :: ml' ->
run k (opt @ ml') recs;
run k ml' recs
| Record (r, recorded) :: ml' ->
if r.found then
failwith "string_match: the same variable matches twice";
if r.begun then begin (* ==> recorded = [] *)
let old_len = r.len in
r.found <- true;
r.len <- k - r.from;
run k ml' (r::recs);
r.found <- false;
r.len <- old_len
end
else begin
let old_from = r.from in
r.begun <- true;
r.from <- k;
run k (recorded @ (Record(r,[]) :: ml')) recs;
r.begun <- false;
r.from <- old_from
end
| Scanner f :: ml' ->
let n = f (String.sub s k (len-k)) in
if k+n > len then
failwith "match";
run (k+n) ml' recs
in
try
let recs =
try run k ml []; raise Not_found
with
Found r -> r
in
List.iter
(fun r ->
if r.found then
r.sref <- s)
recs;
true
with
Not_found ->
false
;;
let match_string ml s =
let rec reset ml =
match ml with
[] -> ()
| Alternative alts :: ml' ->
List.iter reset alts;
reset ml'
| Optional opt :: ml' ->
reset opt;
reset ml'
| Record (v,r) :: ml' ->
v.found <- false;
v.begun <- false;
reset r;
reset ml'
| _ :: ml' ->
reset ml'
in
reset ml;
match_string_at ml s 0
;;
let var s =
{ sref = s; found = false; begun = false; from = 0; len = String.length s }
;;
type replacer =
ReplaceLiteral of string
| ReplaceVar of variable
| ReplaceFunction of (unit -> string)
;;
type rflag =
Anchored
| Limit of int
;;
type repl =
RLit of string
| RRegion of (int * int)
;;
exception Limit_exceeded;;
let replace_matched_substrings ml rl fl s =
let anchored = List.mem Anchored fl in
let all = var "" in
let ml' = [ Record(all, ml)] @ (if anchored then [] else [ Anystring ]) in
let rec resetlist ml =
match ml with
[] -> []
| Alternative alts :: ml' ->
List.flatten (List.map resetlist alts) @
resetlist ml'
| Optional opt :: ml' ->
resetlist opt @
resetlist ml'
| Record (v,r) :: ml' ->
v :: (resetlist r @ resetlist ml')
| _ :: ml' ->
resetlist ml'
in
let resl = resetlist ml' in
let limit =
List.fold_left
(fun m f ->
match f with
Limit n ->
if n < 0 then failwith "replace_matched_substrings";
if m >= 0 then min m n else n
| _ -> m)
(-1)
fl in
let n_repl = ref 0 in
let replace_at k =
if limit >= 0 && !n_repl >= limit then
[], (-1)
else begin
List.iter
(fun v ->
v.found <- false;
v.begun <- false)
resl;
if match_string_at ml' s k then begin
(* interpret rl *)
try
let repltext =
List.map
(fun r ->
match r with
ReplaceLiteral s -> RLit s
| ReplaceVar v ->
if v.found then
RRegion (v.from, v.len)
else
RLit ""
| ReplaceFunction f ->
begin try
RLit (f ())
with
Not_found ->
raise Not_found
| Match_failure (_,_,_) ->
raise Not_found
end)
rl
in
let amount = all.len in
incr n_repl;
repltext, amount
with
Not_found -> [], (-1)
end
else [], (-1)
end
in
let l = String.length s in
let ml'fc = first_character ml' in
let rec left_to_right trans k_gapstart k =
let rec ltor k =
if k < (l-1) then begin
if not (member_of_set (Char.code s.[k]) ml'fc ) then begin
if not (member_of_set (Char.code s.[k+1]) ml'fc ) then begin
ltor (k+2)
end
else try_match trans k_gapstart (k+1)
end
else try_match trans k_gapstart k
end
else
if k <= l then
(* Note k<=l: this criterion could be much better *)
try_match trans k_gapstart k
else
RRegion(k_gapstart, k-k_gapstart-1) :: trans
in
ltor k
and try_match trans k_gapstart k =
let repltext, amount = replace_at k in
if amount >= 0 then begin
left_to_right
(repltext @ [RRegion(k_gapstart, k-k_gapstart)] @ trans)
(k + amount)
(if amount=0 then k+1 else k+amount)
end
else
left_to_right trans k_gapstart (k+1)
in
let with_anchors () =
try
let repltext, amount = replace_at 0 in
repltext
with
Not_found ->
[ RRegion(0, l) ]
| Limit_exceeded ->
[ RRegion(0, l) ]
in
let rec total_length n trans =
match trans with
RLit s :: trans' ->
total_length (n+String.length s) trans'
| RRegion (_,len) :: trans' ->
total_length (n+len) trans'
| [] ->
n
in
let rec form_replacement_ltor target trans j =
match trans with
RLit t :: trans' ->
let ls = String.length t in
let j' = j - ls in
if ls > 0 then String.blit t 0 target j' ls;
form_replacement_ltor target trans' j'
| RRegion (from,len) :: trans' ->
let j' = j - len in
if len > 0 then String.blit s from target j' len;
form_replacement_ltor target trans' j'
| [] -> ()
in
(* TODO: interpret rtol,
* what's with initialization of variables?
*)
let transformer =
if anchored then
with_anchors()
else
left_to_right [] 0 0
in
let length = total_length 0 transformer in
let target = String.create length in
form_replacement_ltor target transformer length;
target, !n_repl
;;
let var_matched v =
v.found
;;
let string_of_var v =
String.sub v.sref v.from v.len
;;
let found_string_of_var v =
if v.found then String.sub v.sref v.from v.len else raise Not_found
;;
let mkset s =
let l = String.length s in
let k = ref (-1) in
let c = ref ' ' in
let next_noesc() =
incr k;
if ( !k < l ) then begin
c := s.[ !k ];
end
in
let set = dup_set (empty_set()) in
let add_char c =
let code = Char.code c in
set_include set code
in
let add_range c1 c2 =
let code1 = Char.code c1 in
let code2 = Char.code c2 in
for i = code1 to code2 do
set_include set i
done
in
let continue = ref true in
next_noesc();
while !continue && !k < l do
match () with
| () when (!k + 2 < l) && (s.[!k + 1] = '-') ->
(* range *)
add_range !c (s.[!k + 2]);
next_noesc();
next_noesc();
next_noesc();
| () ->
add_char !c;
next_noesc();
done;
set
;;
let mknegset s =
!! (mkset s)
;;
(* ======================================================================
* History:
*
* $Log: xstr_match.ml,v $
* Revision 1.9 2002/07/07 11:27:16 gerd
* Fixed Xstr_match.mkset
*
* Revision 1.8 2000/09/23 13:43:22 gerd
* Bugfix in replace_matched_substrings.
*
* Revision 1.7 1999/07/08 02:41:10 gerd
* Bugfix in 'Record' matching: the variable was not reset to
* its old values in the case the matching fails.
*
* Revision 1.6 1999/07/06 21:29:33 gerd
* Optimizations in the 'replace_matched_substrings' function.
*
* Revision 1.5 1999/07/06 00:47:53 gerd
* Added optimization 'first_character'.
* Now 'run' raises an exception in the case of a success, and
* otherwise returns () - the exact opposite as before.
* Many more optimizations for 'match_string'.
*
* Revision 1.4 1999/07/05 22:34:57 gerd
* match_string: simplifications; now much more tail recursions.
*
* Revision 1.3 1999/07/05 21:42:46 gerd
* Bugfix: When Record(_,_) records in a loop, the state of the
* variable was not cleared after every cycle. This is done now.
*
* Revision 1.2 1999/07/04 20:02:07 gerd
* Added Lazystring, Lazystring_from.
* Added replace_matched_substring function.
* Changed the structure of 'variable'. 'sref' is either an arbitrary
* string, or it is the input string of the matching function. 'from' and
* 'len' are always used.
*
* Revision 1.1 1999/06/27 23:03:37 gerd
* Initial revision.
*
*
*)