(* $Id: xstr_split.ml,v 1.2 1999/07/06 21:32:09 gerd Exp $ * ---------------------------------------------------------------------- * *) type cclass = CData | CIgnore | CSeparator | CIgnoreOrSeparator ;; let split_string ignoreset ignoreleft ignoreright separators (* s *) = (* 'ignoreset': Characters that are ignored before and after the * separator * 'ignoreleft': true means to ignore characters from 'ignoreset' at * the beginning of the string * 'ignoreright': true means to ignore characters from 'ignoreset' at * the end of the string * 'separators': list of separating strings * 's': string to split * * EXAMPLES: * * - split_string " " true true [ "," ] "a, b, c ,d " * = [ "a"; "b"; "c"; "d" ] * - split_string "" true true [ "," ] "a, b, c ,d " * = [ "a"; " b"; " c "; "d " ] * - split_string " " false false [ "," ] "a, b, c ,d " * = [ "a"; "b"; "c"; "d " ] *) let character_classification = let a = Array.create 256 CData in let sepchars = List.flatten (List.map (fun sep -> if sep <> "" then [sep.[0]] else []) separators) in let ignorechars = let l = ref [] in for k = 0 to String.length ignoreset - 1 do l := ignoreset.[k] :: !l done; !l in List.iter (fun c -> a.( Char.code c ) <- CSeparator) sepchars; List.iter (fun c -> let code = Char.code c in if a.( code ) = CSeparator then a.( code ) <- CIgnoreOrSeparator else a.( code ) <- CIgnore) ignorechars; a in fun s -> let l = String.length s in let rec split_over_word i_wordbeg i_wordend i_current = (* i_wordbeg <= i_wordend: i_current has not yet reached the next * separator. i_wordbeg is the position of * the first CData character, i_wordend the * position after the last CData Character of * the word. *) if i_current < l then begin let code = Char.code (s.[i_current]) in let cl = character_classification.(code) in match cl with CData -> (* split i_wordbeg (i_current+1) (i_current+1) *) fast_skip_word i_wordbeg (i_current+1) | CIgnore -> split_over_word i_wordbeg i_wordend (i_current+1) | _ -> let rec find_sep sepl = match sepl with [] -> if cl = CSeparator then (* just as CData *) (* split i_wordbeg (i_current+1) (i_current+1) *) fast_skip_word i_wordbeg (i_current+1) else (* just as CIgnore *) split_over_word i_wordbeg i_wordend (i_current+1) | sep :: sepl' -> let lsep = String.length sep in if i_current + lsep <= l & String.sub s i_current lsep = sep then (* found separator *) String.sub s i_wordbeg (i_wordend - i_wordbeg) :: split_after_word (i_current + lsep) (i_current + lsep) else find_sep sepl' in find_sep separators end else (* i_current >= l *) if ignoreright then [ String.sub s i_wordbeg (i_wordend - i_wordbeg) ] else [ String.sub s i_wordbeg (i_current - i_wordbeg) ] and split_after_word i_wordbeg i_current = (* i_wordbeg > i_wordend: i_current is just after the separator and * searches the next word beginning *) if i_current < l then begin let code = Char.code (s.[i_current]) in let cl = character_classification.(code) in match cl with CData -> (* split i_current (i_current+1) (i_current+1) *) fast_skip_word i_current (i_current+1) | (CIgnore|CIgnoreOrSeparator) -> split_after_word i_wordbeg (i_current+1) | CSeparator -> let rec find_sep sepl = match sepl with [] -> (* split i_wordbeg (i_current+1) (i_current+1) *) fast_skip_word i_wordbeg (i_current+1) | sep :: sepl' -> let lsep = String.length sep in if i_current + lsep < l & String.sub s i_current lsep = sep then (* found separator *) "" :: split_after_word (i_current + lsep) (i_current + lsep) else find_sep sepl' in find_sep separators end else (* i_current >= l *) if i_wordbeg = 0 then [] (* not any word found *) else [ "" ] (* Now some frequent special cases *) and fast_skip_word i_wordbeg i_current = (* i_wordbeg <= i_current = i_wordend *) if i_current < l-1 then begin let code1 = Char.code (s.[i_current]) in let cl1 = character_classification.(code1) in match cl1 with CData -> begin let code2 = Char.code (s.[i_current+1]) in let cl2 = character_classification.(code2) in match cl2 with CData -> fast_skip_word i_wordbeg (i_current+2) | CIgnore -> split_over_word i_wordbeg (i_current+1) (i_current+2) | _ -> (* continue with the general routine *) split_over_word i_wordbeg (i_current+1) (i_current+1) end | CIgnore -> split_over_word i_wordbeg i_current (i_current+1) | _ -> (* continue with the general routine *) split_over_word i_wordbeg i_current i_current end else split_over_word i_wordbeg i_current i_current in if ignoreleft then split_after_word 0 0 else split_over_word 0 0 0 ;; (* ====================================================================== * History: * * $Log: xstr_split.ml,v $ * Revision 1.2 1999/07/06 21:32:09 gerd * Tried to optimize the function; but currently without success. * There should be deeper analysis -- on the other hand, splitting seems * to be relative fast compared with the Str splitting function. * Perhaps the improvements have an effect on machines with bigger caches. * * Revision 1.1 1999/06/27 23:03:38 gerd * Initial revision. * * *)