(* $Id: xstr_match.ml,v 1.9 2002/07/07 11:27:16 gerd Exp $ * ---------------------------------------------------------------------- * String matching *) type variable = { mutable sref : string; mutable found : bool; mutable begun : bool; mutable from : int; mutable len : int } ;; type charset = int array;; type matcher = Literal of string | Anystring | Lazystring | Anychar | Anystring_from of charset | Lazystring_from of charset | Anychar_from of charset | Nullstring | Alternative of matcher list list | Optional of matcher list | Record of (variable * matcher list) | Scanner of (string -> int) ;; (**********************************************************************) (* operations on sets *) (* copied from the JavaCaml regexp implementation *) let the_full_set = Array.create 16 0xffff;; let the_empty_set = Array.create 16 0;; let dup_set s = Array.copy s ;; let empty_set () = the_empty_set ;; let full_set () = the_full_set ;; let ( +! ) a b = (* union *) let r = Array.create 16 0 in for i=0 to 15 do r.(i) <- a.(i) lor b.(i) done; r ;; let ( *! ) a b = (* intersection *) let r = Array.create 16 0 in for i=0 to 15 do r.(i) <- a.(i) land b.(i) done; r ;; let ( !! ) a = (* negation *) let r = Array.create 16 0 in for i=0 to 15 do r.(i) <- a.(i) lxor 0xffff done; r ;; let ( ?! ) a = (* not null? *) let n = ref 0 in for i=0 to 15 do n := !n lor a.(i) done; !n <> 0 ;; let set_include a n = (* include in set -- this is in-place modification! *) a.( n lsr 4 ) <- a.( n lsr 4 ) lor (1 lsl (n land 15)) ;; let set_exclude a n = (* exclude from set -- this is in-place modification! *) a.( n lsr 4 ) <- a.( n lsr 4 ) land ((1 lsl (n land 15)) lxor 0xffff) ;; let member_of_set n a = (* (a.( n lsr 4 ) land (1 lsl (n land 15))) <> 0 *) (a.( n lsr 4 ) lsr (n land 15)) land 1 <> 0 ;; let word_set() = let a = dup_set (empty_set()) in List.iter (fun c -> set_include a (Char.code c)) [ 'a'; 'b'; 'c'; 'd'; 'e'; 'f'; 'g'; 'h'; 'i'; 'j'; 'k'; 'l'; 'm'; 'n'; 'o'; 'p'; 'q'; 'r'; 's'; 't'; 'u'; 'v'; 'w'; 'x'; 'y'; 'z'; 'A'; 'B'; 'C'; 'D'; 'E'; 'F'; 'G'; 'H'; 'I'; 'J'; 'K'; 'L'; 'M'; 'N'; 'O'; 'P'; 'Q'; 'R'; 'S'; 'T'; 'U'; 'V'; 'W'; 'X'; 'Y'; 'Z'; '0'; '1'; '2'; '3'; '4'; '5'; '6'; '7'; '8'; '9'; '_' ]; a ;; let noword_set() = let a = word_set() in !! a ;; let set_as_string set = let s = String.make 32 ' ' in for i = 0 to 15 do s.[i+i] <- Char.chr (set.(i) land 0xff); s.[i+i+1] <- Char.chr (set.(i) lsr 8); done; s ;; (**********************************************************************) exception Found of variable list ;; let rec first_character ml = (* return a set of characters s: all non-empty strings that ml matches have * initial characters that are element of s; if ml matches the empty string * then full_set() is returned. *) match ml with [] -> full_set() | Literal "" :: ml' -> first_character ml' | Literal s :: _ -> let cs = dup_set(empty_set()) in set_include cs (Char.code s.[0]); cs | Anystring :: _ -> full_set() | Lazystring :: _ -> full_set() | Anychar :: _ -> full_set() | Anystring_from s :: _ -> full_set() | Lazystring_from s :: _ -> full_set() | Anychar_from s :: _ -> s | Nullstring :: ml' -> first_character ml' | Alternative l :: _ -> List.fold_left (fun s x -> s +! (first_character x)) (empty_set()) l | Optional ml1 :: ml2 -> (first_character ml1) +! (first_character ml2) | Record (v,ml1) :: ml2 -> first_character ml1 | Scanner f :: _ -> full_set() ;; let match_string_at ml s k = let len = String.length s in let rec run k ml recs = (* returns () meaning that nothing has been found, or * Found recs'. * 'k': position in s * 'ml': matcher list to process * 'recs': recorded sections up to now * 'Some recs'': total list of recorded sections *) match ml with [] -> if k = len then raise(Found recs) | Literal x :: ml' -> let xlen = String.length x in begin match xlen with 0 -> run k ml' recs | 1 -> if k+1 <= len && s.[k] = x.[0] then run (k+1) ml' recs | 2 -> if k+2 <= len && s.[k] = x.[0] && s.[k+1] = x.[1] then run (k+2) ml' recs | 3 -> if k+3 <= len && s.[k] = x.[0] && s.[k+1] = x.[1] && s.[k+2] = x.[2] then run (k+3) ml' recs | 4 -> if k+4 <= len && s.[k] = x.[0] && s.[k+1] = x.[1] && s.[k+2] = x.[2] && s.[k+3] = x.[3] then run (k+4) ml' recs | _ -> if k + xlen <= len && String.sub s k xlen = x then run (k+xlen) ml' recs (* this is still not optimal *) end | Anystring :: ml' -> run len ml' recs; let ml'fc = first_character ml' in let rec find n = if n >= 0 then begin if member_of_set (Char.code s.[k+n]) ml'fc then run (k+n) ml' recs; find (n-1) end in find (len-k-1) | Lazystring :: ml' -> let ml'fc = first_character ml' in let max = len-k in let max3 = max - 3 in let rec find n = if n < max3 then begin let c1 = Char.code s.[k+n] in if member_of_set c1 ml'fc then run (k+n) ml' recs; let c2 = Char.code s.[k+n+1] in if member_of_set c2 ml'fc then run (k+n+1) ml' recs; let c3 = Char.code s.[k+n+2] in if member_of_set c3 ml'fc then run (k+n+2) ml' recs; let c4 = Char.code s.[k+n+3] in if member_of_set c4 ml'fc then run (k+n+3) ml' recs; find (n+4) end else if n <= max then begin run (k+n) ml' recs; find (n+1) end in find 0 | Anystring_from set :: ml' -> let rec region n = if k+n < len then let c = Char.code (s.[k+n]) in if member_of_set c set then region (n+1) else n else n in let max = region 0 in run (k+max) ml' recs; let ml'fc = first_character ml' in let rec find n = if n >= 3 then begin if member_of_set (Char.code s.[k+n]) ml'fc then run (k+n) ml' recs; if member_of_set (Char.code s.[k+n-1]) ml'fc then run (k+n-1) ml' recs; if member_of_set (Char.code s.[k+n-2]) ml'fc then run (k+n-2) ml' recs; if member_of_set (Char.code s.[k+n-3]) ml'fc then run (k+n-3) ml' recs; find (n-4) end else if n >= 0 then begin if member_of_set (Char.code s.[k+n]) ml'fc then run (k+n) ml' recs; find (n-1) end in find (max-1) | Lazystring_from set :: ml' -> let rec region n = if k+n < len then let c = Char.code (s.[k+n]) in if member_of_set c set then region (n+1) else n else n in let max = region 0 in let ml'fc = first_character ml' in let rec find n = if n < max then begin if member_of_set (Char.code s.[k+n]) ml'fc then run (k+n) ml' recs; find (n+1) end else if n = max then run (k+max) ml' recs in find 0 | Anychar :: ml' -> if k < len then run (k+1) ml' recs | Anychar_from set :: ml' -> if k < len then let c = Char.code (s.[k]) in if member_of_set c set then run (k+1) ml' recs | Nullstring :: ml' -> run k ml' recs | Alternative alts :: ml' -> let rec find alts = match alts with [] -> () | alt :: alts' -> run k (alt @ ml') recs; find alts' in find alts | Optional opt :: ml' -> run k (opt @ ml') recs; run k ml' recs | Record (r, recorded) :: ml' -> if r.found then failwith "string_match: the same variable matches twice"; if r.begun then begin (* ==> recorded = [] *) let old_len = r.len in r.found <- true; r.len <- k - r.from; run k ml' (r::recs); r.found <- false; r.len <- old_len end else begin let old_from = r.from in r.begun <- true; r.from <- k; run k (recorded @ (Record(r,[]) :: ml')) recs; r.begun <- false; r.from <- old_from end | Scanner f :: ml' -> let n = f (String.sub s k (len-k)) in if k+n > len then failwith "match"; run (k+n) ml' recs in try let recs = try run k ml []; raise Not_found with Found r -> r in List.iter (fun r -> if r.found then r.sref <- s) recs; true with Not_found -> false ;; let match_string ml s = let rec reset ml = match ml with [] -> () | Alternative alts :: ml' -> List.iter reset alts; reset ml' | Optional opt :: ml' -> reset opt; reset ml' | Record (v,r) :: ml' -> v.found <- false; v.begun <- false; reset r; reset ml' | _ :: ml' -> reset ml' in reset ml; match_string_at ml s 0 ;; let var s = { sref = s; found = false; begun = false; from = 0; len = String.length s } ;; type replacer = ReplaceLiteral of string | ReplaceVar of variable | ReplaceFunction of (unit -> string) ;; type rflag = Anchored | Limit of int ;; type repl = RLit of string | RRegion of (int * int) ;; exception Limit_exceeded;; let replace_matched_substrings ml rl fl s = let anchored = List.mem Anchored fl in let all = var "" in let ml' = [ Record(all, ml)] @ (if anchored then [] else [ Anystring ]) in let rec resetlist ml = match ml with [] -> [] | Alternative alts :: ml' -> List.flatten (List.map resetlist alts) @ resetlist ml' | Optional opt :: ml' -> resetlist opt @ resetlist ml' | Record (v,r) :: ml' -> v :: (resetlist r @ resetlist ml') | _ :: ml' -> resetlist ml' in let resl = resetlist ml' in let limit = List.fold_left (fun m f -> match f with Limit n -> if n < 0 then failwith "replace_matched_substrings"; if m >= 0 then min m n else n | _ -> m) (-1) fl in let n_repl = ref 0 in let replace_at k = if limit >= 0 && !n_repl >= limit then [], (-1) else begin List.iter (fun v -> v.found <- false; v.begun <- false) resl; if match_string_at ml' s k then begin (* interpret rl *) try let repltext = List.map (fun r -> match r with ReplaceLiteral s -> RLit s | ReplaceVar v -> if v.found then RRegion (v.from, v.len) else RLit "" | ReplaceFunction f -> begin try RLit (f ()) with Not_found -> raise Not_found | Match_failure (_,_,_) -> raise Not_found end) rl in let amount = all.len in incr n_repl; repltext, amount with Not_found -> [], (-1) end else [], (-1) end in let l = String.length s in let ml'fc = first_character ml' in let rec left_to_right trans k_gapstart k = let rec ltor k = if k < (l-1) then begin if not (member_of_set (Char.code s.[k]) ml'fc ) then begin if not (member_of_set (Char.code s.[k+1]) ml'fc ) then begin ltor (k+2) end else try_match trans k_gapstart (k+1) end else try_match trans k_gapstart k end else if k <= l then (* Note k<=l: this criterion could be much better *) try_match trans k_gapstart k else RRegion(k_gapstart, k-k_gapstart-1) :: trans in ltor k and try_match trans k_gapstart k = let repltext, amount = replace_at k in if amount >= 0 then begin left_to_right (repltext @ [RRegion(k_gapstart, k-k_gapstart)] @ trans) (k + amount) (if amount=0 then k+1 else k+amount) end else left_to_right trans k_gapstart (k+1) in let with_anchors () = try let repltext, amount = replace_at 0 in repltext with Not_found -> [ RRegion(0, l) ] | Limit_exceeded -> [ RRegion(0, l) ] in let rec total_length n trans = match trans with RLit s :: trans' -> total_length (n+String.length s) trans' | RRegion (_,len) :: trans' -> total_length (n+len) trans' | [] -> n in let rec form_replacement_ltor target trans j = match trans with RLit t :: trans' -> let ls = String.length t in let j' = j - ls in if ls > 0 then String.blit t 0 target j' ls; form_replacement_ltor target trans' j' | RRegion (from,len) :: trans' -> let j' = j - len in if len > 0 then String.blit s from target j' len; form_replacement_ltor target trans' j' | [] -> () in (* TODO: interpret rtol, * what's with initialization of variables? *) let transformer = if anchored then with_anchors() else left_to_right [] 0 0 in let length = total_length 0 transformer in let target = String.create length in form_replacement_ltor target transformer length; target, !n_repl ;; let var_matched v = v.found ;; let string_of_var v = String.sub v.sref v.from v.len ;; let found_string_of_var v = if v.found then String.sub v.sref v.from v.len else raise Not_found ;; let mkset s = let l = String.length s in let k = ref (-1) in let c = ref ' ' in let next_noesc() = incr k; if ( !k < l ) then begin c := s.[ !k ]; end in let set = dup_set (empty_set()) in let add_char c = let code = Char.code c in set_include set code in let add_range c1 c2 = let code1 = Char.code c1 in let code2 = Char.code c2 in for i = code1 to code2 do set_include set i done in let continue = ref true in next_noesc(); while !continue && !k < l do match () with | () when (!k + 2 < l) && (s.[!k + 1] = '-') -> (* range *) add_range !c (s.[!k + 2]); next_noesc(); next_noesc(); next_noesc(); | () -> add_char !c; next_noesc(); done; set ;; let mknegset s = !! (mkset s) ;; (* ====================================================================== * History: * * $Log: xstr_match.ml,v $ * Revision 1.9 2002/07/07 11:27:16 gerd * Fixed Xstr_match.mkset * * Revision 1.8 2000/09/23 13:43:22 gerd * Bugfix in replace_matched_substrings. * * Revision 1.7 1999/07/08 02:41:10 gerd * Bugfix in 'Record' matching: the variable was not reset to * its old values in the case the matching fails. * * Revision 1.6 1999/07/06 21:29:33 gerd * Optimizations in the 'replace_matched_substrings' function. * * Revision 1.5 1999/07/06 00:47:53 gerd * Added optimization 'first_character'. * Now 'run' raises an exception in the case of a success, and * otherwise returns () - the exact opposite as before. * Many more optimizations for 'match_string'. * * Revision 1.4 1999/07/05 22:34:57 gerd * match_string: simplifications; now much more tail recursions. * * Revision 1.3 1999/07/05 21:42:46 gerd * Bugfix: When Record(_,_) records in a loop, the state of the * variable was not cleared after every cycle. This is done now. * * Revision 1.2 1999/07/04 20:02:07 gerd * Added Lazystring, Lazystring_from. * Added replace_matched_substring function. * Changed the structure of 'variable'. 'sref' is either an arbitrary * string, or it is the input string of the matching function. 'from' and * 'len' are always used. * * Revision 1.1 1999/06/27 23:03:37 gerd * Initial revision. * * *)