Plasma GitLab Archive
Projects Blog Knowledge

(* $Id$ *)

exception SASLprepError

open Netsaslprep_data

let map (u : int array) =
  let to_space = Hashtbl.create 41 in
  let to_nothing = Hashtbl.create 41 in
  Array.iter (fun p -> Hashtbl.add to_space p ()) map_to_space;
  Array.iter (fun p -> Hashtbl.add to_nothing p ()) map_to_nothing;
  let u0 = Array.to_list u in
  let u1 =
    List.filter
      (fun p -> not (Hashtbl.mem to_nothing p))
      u0 in
  let u2 =
    List.map
      (fun p ->
         if Hashtbl.mem to_space p then
           32
         else
           p
      )
      u1 in
  Array.of_list u2


(* The KC normalizer follows roughly
  https://web.archive.org/web/20070514031407/http://www.unicode.org/unicode/reports/tr15/Normalizer.html
 *)


type buffer =
    { mutable buf : int array;
      mutable len : int
    }


let create_buffer() =
  { buf = Array.make 42 0;
    len = 0
  }

let buffer_at buf k =
  assert(k >= 0 && k < buf.len);
  buf.buf.(k)

let set_buffer_at buf k ch =
  assert(k >= 0 && k < buf.len);
  buf.buf.(k) <- ch

let resize buf =
  let nbuf = Array.make (Array.length buf.buf * 2) 0 in
  Array.blit buf.buf 0 nbuf 0 (Array.length buf.buf);
  buf.buf <- nbuf
             

let insert_at buf k ch =
  assert(k >= 0 && k <= buf.len);
  if buf.len = Array.length buf.buf then resize buf;
  if k < buf.len then
    Array.blit buf.buf k buf.buf (k+1) (buf.len - k);
  buf.buf.(k) <- ch;
  buf.len <- buf.len + 1

let length buf =
  buf.len

let contents buf =
  Array.sub buf.buf 0 buf.len


let get_cano_tab() =
  let cano_tab = Hashtbl.create 41 in
  let last = ref 0 in
  Array.iter
    (fun p ->
       if p < 0 then
         last := -p
       else
         Hashtbl.add cano_tab p !last
    )
    cano_classes;
  cano_tab


let cano_tab =
  (* this table is pretty small *)
  get_cano_tab()



(* Hangul *)
let h_SBase = 0xAC00
let h_LBase = 0x1100
let h_VBase = 0x1161
let h_TBase = 0x11A7
let h_LCount = 19
let h_VCount = 21
let h_TCount = 28
let h_NCount = h_VCount * h_TCount
let h_SCount = h_LCount * h_NCount


let decompose_hangul code =
  if code < h_SBase || code >= h_SBase + h_SCount then raise Not_found;
  let si = code - h_SBase in
  let l = h_LBase + si/h_NCount in
  let v = h_VBase + (si mod h_NCount) / h_TCount in
  let t = h_TBase + si mod h_TCount in
  if t = h_TBase then
    [ l; v ]
  else
    [ l; v; t ]


let compose_hangul first second =
  (* check for L and V *)
  if first >= h_LBase && 
       first < h_LBase + h_LCount &&
         second >= h_VBase &&
           second < h_VBase + h_VCount
  then
    (* create LV syllable *)
    let l = first - h_LBase in
    let v = second - h_VBase in
    h_SBase  +  (l*h_VCount + v) * h_TCount
  else
    (* check for LV and T *)
    let si = first - h_SBase in
    if first >= h_SBase && 
         first < h_SBase + h_SCount && 
           si mod h_TCount = 0
    then
      let ti = second - h_TBase in
      first + ti
    else
      raise Not_found


let decompose (u : int array) =
  (* "compatibility decomposition" as required for NFKC *)
  let decomp_tab = Hashtbl.create 41 in
  let last = ref (ref []) in
  Array.iter
    (fun p ->
       if p < 0 then (
         last := ref [];
         Hashtbl.add decomp_tab ((-p) lsr 1) !last
       )
       else
         !last := p :: ! !last
    )
    decompositions;
  let rec get_recursive_decomp ch =
    try
      let chars = List.rev (! (Hashtbl.find decomp_tab ch)) in
      List.flatten (List.map get_recursive_decomp chars)
    with
      | Not_found ->
           try
             decompose_hangul ch
           with
             | Not_found -> [ch] in
  let get_cc ch = try Hashtbl.find cano_tab ch with Not_found -> 0 in
  let target = create_buffer() in
  for i = 0 to Array.length u - 1 do
    let decomp = get_recursive_decomp u.(i) in
    List.iter
      (fun ch ->
         let cc = get_cc ch in
         let k = ref (length target) in
         if cc <> 0 then (
           while
             !k > 0 &&
               get_cc (buffer_at target (!k-1)) > cc
           do
             decr k
           done
         );
         insert_at target !k ch
      )
      decomp
  done;
  contents target


let compose_1 (u : int array) =
  (* "canonical composition" as required for NFKC *)
  (* u <> [| |] required *)
  let excl_tab = Hashtbl.create 41 in
  Array.iter
    (fun p -> Hashtbl.add excl_tab p ())
    exclusions;
  let comp_tab = Hashtbl.create 41 in
  let last_p = ref 0 in
  let last = ref [] in
  Array.iter
    (fun p ->
       if p < 0 then (
         if !last <> [] then (
           let q = (- !last_p) lsr 1 in
           let is_canonical = (- !last_p) land 1 = 0 in
           let is_excluded = Hashtbl.mem excl_tab q in
           if is_canonical && not is_excluded  then (
             match !last with
               | [ c0 ] ->
                    ()
               | [ c1; c0 ] ->
                    (* NB. We can at most support 15 bits *)
                    assert(c0 < 16384);
                    assert(c1 < 16384);
                    Hashtbl.add comp_tab ((c0 lsl 14) lor c1) q
               | _ ->
                    assert false
           )
         );
         last_p := p;
         last := [];
       )
       else
         last := p :: !last
    )
    decompositions;
  let get_cc ch = try Hashtbl.find cano_tab ch with Not_found -> 0 in
  let target = create_buffer() in
  let starter_pos = ref 0 in
  let starter_ch = ref u.(!starter_pos) in
  let last_class = ref (get_cc !starter_ch) in
  if !last_class <> 0 then last_class := 256;
  insert_at target 0 !starter_ch;
  
  for i = 1 to Array.length u - 1 do
    let ch = u.(i) in
    let cc = get_cc ch in
    try
      let composite =
        try
          if !starter_ch >= 16384 || ch >= 16384 then raise Not_found;
          Hashtbl.find comp_tab ((!starter_ch lsl 14) lor ch)
        with
          | Not_found ->
               compose_hangul !starter_ch ch in
      if !last_class >= cc && !last_class <> 0 then raise Not_found;
      set_buffer_at target !starter_pos composite;
      starter_ch := composite
    with
      | Not_found ->
           if cc = 0 then (
             starter_pos := length target;
             starter_ch := ch;
           );
           last_class := cc;
           insert_at target (length target) ch
  done;
  contents target


let compose u =
  if u = [| |] then [| |] else compose_1 u


let exists f a =
  try
    Array.iter (fun p -> if f p then raise Exit) a;
    false
  with Exit -> true


let norm_needed u =
  (* If the string uses only certain characters we don't need to normalize.
     These are practically all Latin, Greek and Cyrillic characters.
   *)
  let quick_need_norm c =
    not
      ((c >= 0x20 && c <= 0x7e) ||
         (c >= 0xc0 && c <= 0x131) ||
           (c >= 0x134 && c <= 0x13e) ||
             (c >= 0x141 && c <= 0x148) ||
               (c >= 0x14a && c <= 0x17e) ||
                 (c >= 0x180 && c <= 0x1c3) ||
                   (c >= 0x1cd && c <= 0x1f0) ||
                     (c >= 0x1f4 && c <= 0x2ad) ||
                       (c >= 0x374 && c <= 0x375) ||
                         (c = 0x37e) ||
                           (c >= 0x385 && c <= 0x3ce) ||
                             (c >= 0x400 && c <= 0x482) ||
                               (c >= 0x48a && c <= 0x50f)
      ) in
  exists quick_need_norm u


let normalize u =
  (* normalization form KC (NFKC) *)
  if norm_needed u then
    compose (decompose u)
  else
    u


let prohibited u =
  Array.iter
    (fun p -> 
       Array.iter
         (fun (p0, p1) ->
            if p >= p0 && p <= p1 then raise SASLprepError;
         )
         forbidden;
    )
    u;
  u


let is_randalcat c =
  exists
    (fun (c0,c1) -> c >= c0 && c <= c1)
    randalcat


let is_lcat c =
  exists
    (fun (c0,c1) -> c >= c0 && c <= c1)
    lcat


let bidicheck u =
  let u_randalcat =
    Array.map is_randalcat u in
  let u_lcat =
    Array.map is_lcat u in
  let has_randalcat = exists (fun p -> p) u_randalcat in
  let has_lcat = exists (fun p -> p) u_lcat in
  if has_randalcat && has_lcat then raise SASLprepError;
  if has_randalcat && u <> [| |] then (
    if not u_randalcat.(0) || not u_randalcat.(Array.length u - 1) then
      raise SASLprepError
  );
  u


let basecheck u =
  if exists (fun p -> p < 0 || p > 0x10ffff) u then raise SASLprepError;
  ()


let saslprep_a u =
  basecheck u;
  bidicheck ( prohibited (normalize (map u)))


let saslprep s =
  Netconversion.ustring_of_uarray
    `Enc_utf8
    (saslprep_a
       (Netconversion.uarray_of_ustring
          `Enc_utf8
          s
       )
    )

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml