Plasma GitLab Archive
Projects Blog Knowledge

(* $Id: xstr_split.ml,v 1.2 1999/07/06 21:32:09 gerd Exp $
 * ----------------------------------------------------------------------
 *
 *)


type cclass =
    CData
  | CIgnore
  | CSeparator
  | CIgnoreOrSeparator
;;


let split_string ignoreset ignoreleft ignoreright separators (* s *) =
  (* 'ignoreset': Characters that are ignored before and after the
   *              separator
   * 'ignoreleft': true means to ignore characters from 'ignoreset' at
   *               the beginning of the string
   * 'ignoreright': true means to ignore characters from 'ignoreset' at
   *                the end of the string
   * 'separators': list of separating strings
   * 's': string to split
   *
   * EXAMPLES:
   *
   * - split_string " " true true [ "," ] "a, b, c ,d " 
   *   = [ "a"; "b"; "c"; "d" ]
   * - split_string "" true true [ "," ] "a, b, c ,d " 
   *   = [ "a"; " b"; " c "; "d " ]
   * - split_string " " false false [ "," ] "a, b, c ,d " 
   *   = [ "a"; "b"; "c"; "d " ]
   *)
  let character_classification =
    let a = Array.create 256 CData in
    let sepchars =
      List.flatten 
	(List.map (fun sep -> if sep <> "" then [sep.[0]] else []) separators)
    in
    let ignorechars =
      let l = ref [] in
      for k = 0 to String.length ignoreset - 1 do
	l := ignoreset.[k] :: !l
      done;
      !l
    in
    List.iter
      (fun c ->
	a.( Char.code c ) <- CSeparator)
      sepchars;
    List.iter
      (fun c ->
	let code = Char.code c in
	if a.( code ) = CSeparator then
	  a.( code ) <- CIgnoreOrSeparator
	else
	  a.( code ) <- CIgnore)
      ignorechars;
    a
  in
  fun s ->
    let l = String.length s in
    let rec split_over_word i_wordbeg i_wordend i_current =
      (* i_wordbeg <= i_wordend: i_current has not yet reached the next
       *                         separator. i_wordbeg is the position of 
       *                         the first CData character, i_wordend the
       *                         position after the last CData Character of
       *                         the word.
       *)
      if i_current < l then begin
	let code = Char.code (s.[i_current]) in
	let cl = character_classification.(code) in
	match cl with
	  CData -> (* split i_wordbeg (i_current+1) (i_current+1) *)
                   fast_skip_word i_wordbeg (i_current+1)
	| CIgnore -> split_over_word i_wordbeg i_wordend (i_current+1)
	| _ ->
	    let rec find_sep sepl =
	      match sepl with
		[] -> 
		  if cl = CSeparator then (* just as CData *)
		    (* split i_wordbeg (i_current+1) (i_current+1) *)
		    fast_skip_word i_wordbeg (i_current+1)
		  else (* just as CIgnore *)
		    split_over_word i_wordbeg i_wordend (i_current+1)
	      | sep :: sepl' ->
		  let lsep = String.length sep in
		  if i_current + lsep <= l & 
		    String.sub s i_current lsep = sep then
		      (* found separator *)
		    String.sub s i_wordbeg (i_wordend - i_wordbeg) ::
		    split_after_word (i_current + lsep) (i_current + lsep)
		  else
		    find_sep sepl'
	    in
	    find_sep separators
      end
      else
	(* i_current >= l *)
	if ignoreright then
	  [ String.sub s i_wordbeg (i_wordend - i_wordbeg) ]
	else
    	  [ String.sub s i_wordbeg (i_current - i_wordbeg) ]

    and split_after_word i_wordbeg i_current =
      (* i_wordbeg > i_wordend:  i_current is just after the separator and
       *                         searches the next word beginning
       *)
      if i_current < l then begin
	let code = Char.code (s.[i_current]) in
	let cl = character_classification.(code) in
	match cl with
	  CData -> 
	      (* split i_current (i_current+1) (i_current+1) *)
	      fast_skip_word i_current (i_current+1)
	| (CIgnore|CIgnoreOrSeparator) -> 
	    split_after_word i_wordbeg (i_current+1)
	| CSeparator ->
	    let rec find_sep sepl =
	      match sepl with
		[] -> 
		    (* split i_wordbeg (i_current+1) (i_current+1) *)
		    fast_skip_word i_wordbeg (i_current+1)
	      | sep :: sepl' ->
		  let lsep = String.length sep in
		  if i_current + lsep < l & 
		    String.sub s i_current lsep = sep then
		      (* found separator *)
		    "" ::
		    split_after_word (i_current + lsep) (i_current + lsep)
		  else
		    find_sep sepl'
	    in
	    find_sep separators
      end
      else
	(* i_current >= l *)
	if i_wordbeg = 0 then
	  []  (* not any word found *)
	else
	  [ "" ]

    (* Now some frequent special cases *)

    and fast_skip_word i_wordbeg i_current =
      (* i_wordbeg <= i_current = i_wordend *)
      if i_current < l-1 then begin
	let code1 = Char.code (s.[i_current]) in
	let cl1 = character_classification.(code1) in
	match cl1 with
	  CData -> 
	    begin 
	      let code2 = Char.code (s.[i_current+1]) in
	      let cl2 = character_classification.(code2) in
	      match cl2 with
		CData -> fast_skip_word i_wordbeg (i_current+2)
	      | CIgnore -> split_over_word i_wordbeg (i_current+1) (i_current+2)
	      |	_ -> (* continue with the general routine *)
		     split_over_word i_wordbeg (i_current+1) (i_current+1)
	    end
	| CIgnore -> split_over_word i_wordbeg i_current (i_current+1)
	| _     -> (* continue with the general routine *)
    	           split_over_word i_wordbeg i_current i_current
      end
      else split_over_word i_wordbeg i_current i_current


    in
    if ignoreleft then
      split_after_word 0 0
    else
      split_over_word 0 0 0
;;




(* ======================================================================
 * History:
 * 
 * $Log: xstr_split.ml,v $
 * Revision 1.2  1999/07/06 21:32:09  gerd
 * 	Tried to optimize the function; but currently without success.
 * There should be deeper analysis -- on the other hand, splitting seems
 * to be relative fast compared with the Str splitting function.
 * Perhaps the improvements have an effect on machines with bigger caches.
 *
 * Revision 1.1  1999/06/27 23:03:38  gerd
 * 	Initial revision.
 *
 * 
 *)

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml