Plasma GitLab Archive
Projects Blog Knowledge

(* $Id: xstr_match.ml,v 1.9 2002/07/07 11:27:16 gerd Exp $
 * ----------------------------------------------------------------------
 * String matching 
 *)


type variable =
    { mutable sref : string;
      mutable found : bool;
      mutable begun : bool;
      mutable from : int;
      mutable len : int
    } 
;;


type charset = int array;;


type matcher =
    Literal of string
  | Anystring 
  | Lazystring 
  | Anychar
  | Anystring_from of charset
  | Lazystring_from of charset
  | Anychar_from of charset
  | Nullstring
  | Alternative of matcher list list
  | Optional of matcher list
  | Record of (variable * matcher list)
  | Scanner of (string -> int)
;;


(**********************************************************************)

(* operations on sets *)

(* copied from the JavaCaml regexp implementation *)

let the_full_set = Array.create 16 0xffff;;
let the_empty_set = Array.create 16 0;;

let dup_set s =
  Array.copy s
;;


let empty_set () =
  the_empty_set
;;


let full_set () =
  the_full_set
;;


let ( +! ) a b =
  (* union *)
  let r = Array.create 16 0 in
  for i=0 to 15 do
    r.(i) <- a.(i) lor b.(i)
  done;
  r
;;


let ( *! ) a b =
  (* intersection *)
  let r = Array.create 16 0 in
  for i=0 to 15 do
    r.(i) <- a.(i) land b.(i)
  done;
  r
;;


let ( !! ) a =
  (* negation *)
  let r = Array.create 16 0 in
  for i=0 to 15 do
    r.(i) <- a.(i) lxor 0xffff
  done;
  r
;;


let ( ?! ) a =
  (* not null? *)
  let n = ref 0 in
  for i=0 to 15 do
    n := !n lor a.(i)
  done;
  !n <> 0
;;


let set_include a n =
  (* include in set -- this is in-place modification! *)
  a.( n lsr 4 ) <- a.( n lsr 4 ) lor (1 lsl (n land 15))
;;


let set_exclude a n =
  (* exclude from set -- this is in-place modification! *)
  a.( n lsr 4 ) <- a.( n lsr 4 ) land ((1 lsl (n land 15)) lxor 0xffff)
;;


let member_of_set n a =
(*  (a.( n lsr 4 ) land (1 lsl (n land 15))) <> 0 *)
  (a.( n lsr 4 ) lsr (n land 15)) land 1 <> 0
;;


let word_set() =
  let a = dup_set (empty_set()) in
  List.iter
    (fun c ->
      set_include a (Char.code c))
    [ 'a'; 'b'; 'c'; 'd'; 'e'; 'f'; 'g'; 'h'; 'i'; 'j'; 'k'; 'l'; 'm';
      'n'; 'o'; 'p'; 'q'; 'r'; 's'; 't'; 'u'; 'v'; 'w'; 'x'; 'y'; 'z';
      'A'; 'B'; 'C'; 'D'; 'E'; 'F'; 'G'; 'H'; 'I'; 'J'; 'K'; 'L'; 'M';
      'N'; 'O'; 'P'; 'Q'; 'R'; 'S'; 'T'; 'U'; 'V'; 'W'; 'X'; 'Y'; 'Z';
      '0'; '1'; '2'; '3'; '4'; '5'; '6'; '7'; '8'; '9';
      '_' ];
  a
;;


let noword_set() =
  let a = word_set() in
  !! a
;;
    

let set_as_string set =
  let s = String.make 32 ' ' in
  for i = 0 to 15 do
    s.[i+i]   <- Char.chr (set.(i) land 0xff);
    s.[i+i+1] <- Char.chr (set.(i) lsr 8);
  done;
  s
;;


(**********************************************************************)


exception Found of variable list
;;


let rec first_character ml =
  (* return a set of characters s: all non-empty strings that ml matches have
   * initial characters that are element of s; if ml matches the empty string 
   * then full_set() is returned.
   *)
  match ml with
    [] -> full_set()
  | Literal "" :: ml' -> first_character ml'
  | Literal s :: _ -> 
      let cs = dup_set(empty_set()) in
      set_include cs (Char.code s.[0]);
      cs
  | Anystring :: _ -> full_set()
  | Lazystring :: _ -> full_set()
  | Anychar :: _ -> full_set()
  | Anystring_from s :: _ -> full_set()
  | Lazystring_from s :: _ -> full_set()
  | Anychar_from s :: _ -> s
  | Nullstring :: ml' -> first_character ml'
  | Alternative l :: _ ->
      List.fold_left
	(fun s x -> s +! (first_character x))
	(empty_set())
	l
  | Optional ml1 :: ml2 -> (first_character ml1) +! (first_character ml2)
  | Record (v,ml1) :: ml2 -> first_character ml1
  | Scanner f :: _ -> full_set()
;;


let match_string_at ml s k =
  
  let len = String.length s in

  let rec run k ml recs =
    (* returns () meaning that nothing has been found, or
     * Found recs'.
     * 'k': position in s
     * 'ml': matcher list to process
     * 'recs': recorded sections up to now
     * 'Some recs'': total list of recorded sections
     *)
    match ml with
      [] ->
	if k = len then
	  raise(Found recs)
    | Literal x :: ml' ->
	let xlen = String.length x in
	begin match xlen with
	  0 -> run k ml' recs
	| 1 -> if k+1 <= len && s.[k] = x.[0] then run (k+1) ml' recs
	| 2 -> if k+2 <= len && s.[k] = x.[0] && s.[k+1] = x.[1] then 
	       run (k+2) ml' recs
	| 3 -> if k+3 <= len && s.[k] = x.[0] && s.[k+1] = x.[1] &&
	          s.[k+2] = x.[2] then 
	       run (k+3) ml' recs
	| 4 -> if k+4 <= len && s.[k] = x.[0] && s.[k+1] = x.[1] &&
	          s.[k+2] = x.[2] && s.[k+3] = x.[3] then 
	       run (k+4) ml' recs
	| _ -> if k + xlen <= len && String.sub s k xlen = x then
	       run (k+xlen) ml' recs
	          (* this is still not optimal *)
	end
    | Anystring :: ml' ->
	run len ml' recs;
	let ml'fc = first_character ml' in
	let rec find n =
	  if n >= 0 then begin
	    if member_of_set (Char.code s.[k+n]) ml'fc then
	      run (k+n) ml' recs;
	    find (n-1)
	  end
	in
	find (len-k-1)
    | Lazystring :: ml' ->
	let ml'fc = first_character ml' in
	let max = len-k in
	let max3 = max - 3 in
	let rec find n =
	  if n < max3 then begin
	    let c1 = Char.code s.[k+n] in
	    if member_of_set c1 ml'fc then 
	      run (k+n) ml' recs;
	    let c2 = Char.code s.[k+n+1] in
	    if member_of_set c2 ml'fc then
	      run (k+n+1) ml' recs;
	    let c3 = Char.code s.[k+n+2] in
	    if member_of_set c3 ml'fc then
	      run (k+n+2) ml' recs;
	    let c4 = Char.code s.[k+n+3] in
	    if member_of_set c4 ml'fc then
	      run (k+n+3) ml' recs;
	    find (n+4)
	  end
	  else if n <= max then begin
	    run (k+n) ml' recs;
	    find (n+1)
	  end
	in
	find 0
    | Anystring_from set :: ml' ->
	let rec region n =
	  if k+n < len then
	    let c = Char.code (s.[k+n]) in
	    if member_of_set c set then
	      region (n+1)
	    else
	      n
	  else
	    n
	in
	let max = region 0 in
	run (k+max) ml' recs;
	let ml'fc = first_character ml' in
	let rec find n =
	  if n >= 3 then begin
	    if member_of_set (Char.code s.[k+n]) ml'fc then
	      run (k+n) ml' recs;
	    if member_of_set (Char.code s.[k+n-1]) ml'fc then
	      run (k+n-1) ml' recs;
	    if member_of_set (Char.code s.[k+n-2]) ml'fc then
	      run (k+n-2) ml' recs;
	    if member_of_set (Char.code s.[k+n-3]) ml'fc then
	      run (k+n-3) ml' recs;
	    find (n-4)
	  end
	  else if n >= 0 then begin
	    if member_of_set (Char.code s.[k+n]) ml'fc then
	      run (k+n) ml' recs;
	    find (n-1)
	  end
	in
	find (max-1)
    | Lazystring_from set :: ml' ->
	let rec region n =
	  if k+n < len then
	    let c = Char.code (s.[k+n]) in
	    if member_of_set c set then
	      region (n+1)
	    else
	      n
	  else
	    n
	in
	let max = region 0 in
	let ml'fc = first_character ml' in
	let rec find n =
	  if n < max then begin
	    if member_of_set (Char.code s.[k+n]) ml'fc then
	      run (k+n) ml' recs;
	    find (n+1)
	  end 
	  else if n = max then
	    run (k+max) ml' recs
	in
	find 0
    | Anychar :: ml' ->
	if k < len then
	  run (k+1) ml' recs
    | Anychar_from set :: ml' ->
	if k < len then
	  let c = Char.code (s.[k]) in
	  if member_of_set c set then 
	    run (k+1) ml' recs
    | Nullstring :: ml' ->
	run k ml' recs
    | Alternative alts :: ml' ->
	let rec find alts =
	  match alts with
	    [] -> ()
	  | alt :: alts' ->
	      run k (alt @ ml') recs;
	      find alts'
	in
	find alts
    | Optional opt :: ml' ->
	run k (opt @ ml') recs;
	run k ml' recs
    | Record (r, recorded) :: ml' ->
	if r.found then
	  failwith "string_match: the same variable matches twice";
	if r.begun then begin (* ==> recorded = [] *)
	  let old_len = r.len in
	  r.found <- true;
	  r.len <- k - r.from;
	  run k ml' (r::recs);
	  r.found <- false;
	  r.len <- old_len
	end
	else begin
	  let old_from = r.from in
	  r.begun <- true;
	  r.from <- k;
	  run k (recorded @ (Record(r,[]) :: ml')) recs;
	  r.begun <- false;
	  r.from <- old_from
	end
    | Scanner f :: ml' ->
	let n = f (String.sub s k (len-k)) in
	if k+n > len then
	  failwith "match";
	run (k+n) ml' recs

  in

  try
    let recs = 
      try run k ml []; raise Not_found 
      with
      	Found r -> r
    in

    List.iter
      (fun r ->
	if r.found then
	  r.sref <- s)
      recs;
    true
  with
    Not_found ->
      false
;;


let match_string ml s =
  let rec reset ml =
    match ml with
      [] -> ()
    | Alternative alts :: ml' ->
	List.iter reset alts;
	reset ml'
    | Optional opt :: ml' ->
	reset opt;
	reset ml'
    | Record (v,r) :: ml' ->
	v.found <- false;
	v.begun <- false;
	reset r;
	reset ml'
    | _ :: ml' ->
	reset ml'
  in

  reset ml;
  match_string_at ml s 0
;;



let var s =
  { sref = s; found = false; begun = false; from = 0; len = String.length s }
;;



type replacer =
    ReplaceLiteral of string
  | ReplaceVar of variable
  | ReplaceFunction of (unit -> string)
;;


type rflag =
    Anchored
  | Limit of int
;;


type repl =
    RLit of string
  | RRegion of (int * int)
;;


exception Limit_exceeded;;


let replace_matched_substrings ml rl fl s =
  let anchored = List.mem Anchored fl in
  let all = var "" in
  let ml' = [ Record(all, ml)] @ (if anchored then [] else [ Anystring ]) in

  let rec resetlist ml =
    match ml with
      [] -> []
    | Alternative alts :: ml' ->
	List.flatten (List.map resetlist alts) @
	resetlist ml'
    | Optional opt :: ml' ->
	resetlist opt @
	resetlist ml'
    | Record (v,r) :: ml' ->
	v :: (resetlist r @ resetlist ml')
    | _ :: ml' ->
	resetlist ml'
  in

  let resl = resetlist ml' in
  let limit = 
    List.fold_left 
      (fun m f ->
	match f with
	  Limit n -> 
	    if n < 0 then failwith "replace_matched_substrings";
	    if m >= 0 then min m n else n
	| _       -> m)
      (-1)
      fl in

  let n_repl = ref 0 in

  let replace_at k =
    if limit >= 0 && !n_repl >= limit then
      [], (-1)
    else begin
      List.iter
	(fun v -> 
	  v.found <- false;
	  v.begun <- false)
	resl;
      if match_string_at ml' s k then begin
        (* interpret rl *)
	try
	  let repltext =
	    List.map
	      (fun r ->
		match r with
		  ReplaceLiteral s -> RLit s
		| ReplaceVar v -> 
		    if v.found then
		      RRegion (v.from, v.len)
		    else
		      RLit ""
		| ReplaceFunction f ->
		    begin try
		      RLit (f ())
		    with
		      Not_found -> 
			raise Not_found
		    | Match_failure (_,_,_) ->
			raise Not_found
		    end) 
	      rl
	  in
	  let amount = all.len in
	  incr n_repl;
	  repltext, amount
	with
	  Not_found -> [], (-1)
      end
      else [], (-1)
    end
  in

  let l = String.length s in

  let ml'fc = first_character ml' in

  let rec left_to_right trans k_gapstart k =
    let rec ltor k =
      if k < (l-1) then begin
	if not (member_of_set (Char.code s.[k]) ml'fc ) then begin
	  if not (member_of_set (Char.code s.[k+1]) ml'fc ) then begin
	    ltor (k+2)
	  end
	  else try_match trans k_gapstart (k+1)
	end
	else try_match trans k_gapstart k
      end
      else
	if k <= l then 
          (* Note k<=l: this criterion could be much better *)
	  try_match trans k_gapstart k
	else
	  RRegion(k_gapstart, k-k_gapstart-1) :: trans
    in
    ltor k

  and try_match trans k_gapstart k =
    let repltext, amount = replace_at k in
    if amount >= 0 then begin
      left_to_right
	(repltext @ [RRegion(k_gapstart, k-k_gapstart)] @ trans)
	(k + amount)
	(if amount=0 then k+1 else k+amount)
    end 
    else
      left_to_right trans k_gapstart (k+1)
    
  in

  let with_anchors () =
    try
      let repltext, amount = replace_at 0 in
      repltext 
    with
      Not_found ->
	[ RRegion(0, l) ]
    | Limit_exceeded ->
	[ RRegion(0, l) ]
  in

  let rec total_length n trans =
    match trans with
      RLit s :: trans' ->
	total_length (n+String.length s) trans'
    | RRegion (_,len) :: trans' ->
	total_length (n+len) trans'
    | [] ->
	n
  in

  let rec form_replacement_ltor target trans j =
    match trans with
      RLit t :: trans' ->
	let ls = String.length t in
	let j' = j - ls in
	if ls > 0 then String.blit t 0 target j' ls;
	form_replacement_ltor target trans' j'
    | RRegion (from,len) :: trans' ->
	let j' = j - len in
	if len > 0 then String.blit s from target j' len;
	form_replacement_ltor target trans' j'
    | [] -> ()
  in

  (* TODO: interpret rtol, 
   * what's with initialization of variables?
   *)

  let transformer = 
    if anchored then
      with_anchors()
    else
      left_to_right [] 0 0 
  in
  let length = total_length 0 transformer in
  let target = String.create length in
  form_replacement_ltor target transformer length;
  target, !n_repl
;;
  


let var_matched v =
  v.found
;;


let string_of_var v =
  String.sub v.sref v.from v.len
;;


let found_string_of_var v =
  if v.found then String.sub v.sref v.from v.len else raise Not_found
;;


let mkset s =
  let l = String.length s in
  let k = ref (-1) in
  let c = ref ' ' in

  let next_noesc() =
    incr k;
    if ( !k < l ) then begin
      c := s.[ !k ];
    end
  in

  let set = dup_set (empty_set()) in

  let add_char c =
    let code = Char.code c in
    set_include set code
  in

  let add_range c1 c2 =
    let code1 = Char.code c1 in
    let code2 = Char.code c2 in
    for i = code1 to code2 do
      set_include set i
    done
  in

  let continue = ref true in
  next_noesc();

  while !continue && !k < l do
    match () with
    | () when (!k + 2 < l) && (s.[!k + 1] = '-') ->

        (* range *)

	add_range !c (s.[!k + 2]);
	next_noesc();
	next_noesc();
	next_noesc();

    | () ->
	add_char !c;
	next_noesc();
  done;

  set
;;


let mknegset s =
  !! (mkset s)
;;


(* ======================================================================
 * History:
 * 
 * $Log: xstr_match.ml,v $
 * Revision 1.9  2002/07/07 11:27:16  gerd
 * 	Fixed Xstr_match.mkset
 *
 * Revision 1.8  2000/09/23 13:43:22  gerd
 * 	Bugfix in replace_matched_substrings.
 *
 * Revision 1.7  1999/07/08 02:41:10  gerd
 * 	Bugfix in 'Record' matching: the variable was not reset to
 * its old values in the case the matching fails.
 *
 * Revision 1.6  1999/07/06 21:29:33  gerd
 * 	Optimizations in the 'replace_matched_substrings' function.
 *
 * Revision 1.5  1999/07/06 00:47:53  gerd
 * 	Added optimization 'first_character'.
 * 	Now 'run' raises an exception in the case of a success, and
 * otherwise returns () - the exact opposite as before.
 * 	Many more optimizations for 'match_string'.
 *
 * Revision 1.4  1999/07/05 22:34:57  gerd
 * 	match_string: simplifications; now much more tail recursions.
 *
 * Revision 1.3  1999/07/05 21:42:46  gerd
 * 	Bugfix: When Record(_,_) records in a loop, the state of the
 * variable was not cleared after every cycle. This is done now.
 *
 * Revision 1.2  1999/07/04 20:02:07  gerd
 * 	Added Lazystring, Lazystring_from.
 * 	Added replace_matched_substring function.
 * 	Changed the structure of 'variable'. 'sref' is either an arbitrary
 * string, or it is the input string of the matching function. 'from' and
 * 'len' are always used.
 *
 * Revision 1.1  1999/06/27 23:03:37  gerd
 * 	Initial revision.
 *
 * 
 *)

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml