(* $Id: xstr_split.ml,v 1.2 1999/07/06 21:32:09 gerd Exp $
* ----------------------------------------------------------------------
*
*)
type cclass =
CData
| CIgnore
| CSeparator
| CIgnoreOrSeparator
;;
let split_string ignoreset ignoreleft ignoreright separators (* s *) =
(* 'ignoreset': Characters that are ignored before and after the
* separator
* 'ignoreleft': true means to ignore characters from 'ignoreset' at
* the beginning of the string
* 'ignoreright': true means to ignore characters from 'ignoreset' at
* the end of the string
* 'separators': list of separating strings
* 's': string to split
*
* EXAMPLES:
*
* - split_string " " true true [ "," ] "a, b, c ,d "
* = [ "a"; "b"; "c"; "d" ]
* - split_string "" true true [ "," ] "a, b, c ,d "
* = [ "a"; " b"; " c "; "d " ]
* - split_string " " false false [ "," ] "a, b, c ,d "
* = [ "a"; "b"; "c"; "d " ]
*)
let character_classification =
let a = Array.create 256 CData in
let sepchars =
List.flatten
(List.map (fun sep -> if sep <> "" then [sep.[0]] else []) separators)
in
let ignorechars =
let l = ref [] in
for k = 0 to String.length ignoreset - 1 do
l := ignoreset.[k] :: !l
done;
!l
in
List.iter
(fun c ->
a.( Char.code c ) <- CSeparator)
sepchars;
List.iter
(fun c ->
let code = Char.code c in
if a.( code ) = CSeparator then
a.( code ) <- CIgnoreOrSeparator
else
a.( code ) <- CIgnore)
ignorechars;
a
in
fun s ->
let l = String.length s in
let rec split_over_word i_wordbeg i_wordend i_current =
(* i_wordbeg <= i_wordend: i_current has not yet reached the next
* separator. i_wordbeg is the position of
* the first CData character, i_wordend the
* position after the last CData Character of
* the word.
*)
if i_current < l then begin
let code = Char.code (s.[i_current]) in
let cl = character_classification.(code) in
match cl with
CData -> (* split i_wordbeg (i_current+1) (i_current+1) *)
fast_skip_word i_wordbeg (i_current+1)
| CIgnore -> split_over_word i_wordbeg i_wordend (i_current+1)
| _ ->
let rec find_sep sepl =
match sepl with
[] ->
if cl = CSeparator then (* just as CData *)
(* split i_wordbeg (i_current+1) (i_current+1) *)
fast_skip_word i_wordbeg (i_current+1)
else (* just as CIgnore *)
split_over_word i_wordbeg i_wordend (i_current+1)
| sep :: sepl' ->
let lsep = String.length sep in
if i_current + lsep <= l &
String.sub s i_current lsep = sep then
(* found separator *)
String.sub s i_wordbeg (i_wordend - i_wordbeg) ::
split_after_word (i_current + lsep) (i_current + lsep)
else
find_sep sepl'
in
find_sep separators
end
else
(* i_current >= l *)
if ignoreright then
[ String.sub s i_wordbeg (i_wordend - i_wordbeg) ]
else
[ String.sub s i_wordbeg (i_current - i_wordbeg) ]
and split_after_word i_wordbeg i_current =
(* i_wordbeg > i_wordend: i_current is just after the separator and
* searches the next word beginning
*)
if i_current < l then begin
let code = Char.code (s.[i_current]) in
let cl = character_classification.(code) in
match cl with
CData ->
(* split i_current (i_current+1) (i_current+1) *)
fast_skip_word i_current (i_current+1)
| (CIgnore|CIgnoreOrSeparator) ->
split_after_word i_wordbeg (i_current+1)
| CSeparator ->
let rec find_sep sepl =
match sepl with
[] ->
(* split i_wordbeg (i_current+1) (i_current+1) *)
fast_skip_word i_wordbeg (i_current+1)
| sep :: sepl' ->
let lsep = String.length sep in
if i_current + lsep < l &
String.sub s i_current lsep = sep then
(* found separator *)
"" ::
split_after_word (i_current + lsep) (i_current + lsep)
else
find_sep sepl'
in
find_sep separators
end
else
(* i_current >= l *)
if i_wordbeg = 0 then
[] (* not any word found *)
else
[ "" ]
(* Now some frequent special cases *)
and fast_skip_word i_wordbeg i_current =
(* i_wordbeg <= i_current = i_wordend *)
if i_current < l-1 then begin
let code1 = Char.code (s.[i_current]) in
let cl1 = character_classification.(code1) in
match cl1 with
CData ->
begin
let code2 = Char.code (s.[i_current+1]) in
let cl2 = character_classification.(code2) in
match cl2 with
CData -> fast_skip_word i_wordbeg (i_current+2)
| CIgnore -> split_over_word i_wordbeg (i_current+1) (i_current+2)
| _ -> (* continue with the general routine *)
split_over_word i_wordbeg (i_current+1) (i_current+1)
end
| CIgnore -> split_over_word i_wordbeg i_current (i_current+1)
| _ -> (* continue with the general routine *)
split_over_word i_wordbeg i_current i_current
end
else split_over_word i_wordbeg i_current i_current
in
if ignoreleft then
split_after_word 0 0
else
split_over_word 0 0 0
;;
(* ======================================================================
* History:
*
* $Log: xstr_split.ml,v $
* Revision 1.2 1999/07/06 21:32:09 gerd
* Tried to optimize the function; but currently without success.
* There should be deeper analysis -- on the other hand, splitting seems
* to be relative fast compared with the Str splitting function.
* Perhaps the improvements have an effect on machines with bigger caches.
*
* Revision 1.1 1999/06/27 23:03:38 gerd
* Initial revision.
*
*
*)