Plasma GitLab Archive
Projects Blog Knowledge

open Printf

(* SASLprep requires Unicode-3.2 *)

let exclusions_file =
  "tmp/CompositionExclusions-3.2.0.txt"

let unicode_file =
  "tmp/UnicodeData-3.2.0.txt"

let hex_re =
  Str.regexp "\\([0-9a-fA-F]+\\)"

let ( ||| ) x y =
  if x<>0 then x else y



let print_array l =
  let first = ref true in
  List.iter
    (fun n ->
       if not !first then printf ";\n     ";
       printf "%d" n;
       first := false;
    )
    l


let print_pairs l =
  let first = ref true in
  List.iter
    (fun (p1,p2) ->
       if not !first then printf ";\n     ";
       printf "(%d,%d)" p1 p2;
       first := false;
    )
    l


let exclusions() =
  let f = open_in exclusions_file in
  let l = ref [] in
  ( try
      while true do
        let line = input_line f in
        if line <> "" && line.[0] <> '#' then (
          if Str.string_match hex_re line 0 then (
            let x = Str.matched_group 1 line in
            let n = int_of_string ("0x" ^ x) in
            l := n :: !l
          )
        )
      done
    with End_of_file -> ()
  );
  printf "let exclusions =\n  [| ";
  print_array (List.rev !l);
  printf " |]\n\n%!";
  close_in f



let semi_re = Str.regexp ";"
let space_re = Str.regexp " "

let decompositions() =
  let f = open_in unicode_file in
  let lnr = ref 1 in
  let cano_classes = ref [] in
  let decomps = ref [] in
  let randalcats = ref [] in
  let lcats = ref [] in
  ( try
      while true do
        let line = input_line f in
        if line <> "" && line.[0] <> '#' then (
          let fields = Array.of_list (Str.split_delim semi_re line) in
          if Array.length fields <> 15 then 
            failwith ("bad data line " ^ string_of_int !lnr);
          let code =
            int_of_string ("0x" ^ fields.(0)) in
          let cano_class =
            int_of_string fields.(3) in
          if cano_class <> 0 then
            cano_classes := (code, cano_class) :: !cano_classes;
          let decomp = fields.(5) in
          let is_compat = decomp <> "" && decomp.[0] = '<' in
          let decomp_words1 = Str.split space_re decomp in
          let decomp_words2 =
            List.filter (fun s -> s.[0] <> '<') decomp_words1 in
          let decomp_chars =
            List.map (fun s -> int_of_string ("0x" ^ s)) decomp_words2 in
          if decomp_chars <> [] then
            decomps := (code, is_compat, decomp_chars) :: !decomps;
          let bidi_cat = fields.(4) in
          if bidi_cat = "R" || bidi_cat = "AL" then (
            match !randalcats with
              | (c0, c1) :: l' when c1 = code-1 ->
                   randalcats := (c0, code) :: l'
              | _ -> 
                   randalcats := (code,code) :: !randalcats
          )
          else if bidi_cat = "L" then (
            match !lcats with
              | (c0, c1) :: l' when c1 = code-1 ->
                   lcats := (c0, code) :: l'
              | _ ->
                   lcats := (code,code) :: !lcats
          )
        );
        incr lnr
      done
    with End_of_file -> ()
  );
  cano_classes :=
    List.sort (fun (code1,c1) (code2,c2) -> c1 - c2 ||| code1-code2) !cano_classes;
  let out_classes = ref [] in
  let last_class = ref 0 in
  List.iter
    (fun (code,cls) ->
       if cls <> !last_class then
         out_classes := (-cls) :: !out_classes;
       out_classes := code :: !out_classes;
       last_class := cls;
    )
    !cano_classes;
  printf "let cano_classes =\n  [| ";
  print_array (List.rev !out_classes);
  printf " |]\n\n%!";
  let out_decomps = ref [] in
  List.iter
    (fun (code, is_compat, decomp_chars) ->
       let out_code = (code lsl 1) lor (if is_compat then 1 else 0) in
       out_decomps := List.rev decomp_chars @ [ -out_code ] @ !out_decomps;
    )
    (List.rev !decomps);
  printf "let decompositions =\n  [| ";
  print_array (List.rev !out_decomps);
  printf " |]\n\n%!";
  printf "let randalcat =\n  [| ";
  print_pairs (List.rev !randalcats);
  printf " |]\n\n%!";
  printf "let lcat =\n  [| ";
  print_pairs (List.rev !lcats);
  printf " |]\n\n%!";
  ()

let map_to_nothing_tab =
  (* RFC 3454, B.1 *)
  [ 0x00AD;
    0x034F;
    0x1806;
    0x180B;
    0x180C;
    0x180D;
    0x200B;
    0x200C;
    0x200D;
    0x2060;
    0xFE00;
    0xFE01;
    0xFE02;
    0xFE03;
    0xFE04;
    0xFE05;
    0xFE06;
    0xFE07;
    0xFE08;
    0xFE09;
    0xFE0A;
    0xFE0B;
    0xFE0C;
    0xFE0D;
    0xFE0E;
    0xFE0F;
    0xFEFF;
  ]

let map_to_nothing() =
  printf "let map_to_nothing =\n  [| ";
  print_array map_to_nothing_tab;
  printf " |]\n\n%!"


let map_to_space_tab =
  (* RFC 3454, C.1.2 *)
  [ 0x00A0;
    0x1680;
    0x2000;
    0x2001;
    0x2002;
    0x2003;
    0x2004;
    0x2005;
    0x2006;
    0x2007;
    0x2008;
    0x2009;
    0x200A;
    0x200B;
    0x202F;
    0x205F;
    0x3000;
  ]

let map_to_space() =
  printf "let map_to_space =\n  [| ";
  print_array map_to_space_tab;
  printf " |]\n\n%!"

let forbidden_tab =
  [ (* RFC 3454, C.2.2: Control *)
    0x0000, 0x001f;
    0x007f, 0x007f;
    0x0080, 0x009f;
    0x06DD, 0x06DD;
    0x070F, 0x070F;
    0x180E, 0x180E;
    0x200C, 0x200D;
    0x2028, 0x2029;
    0x2060, 0x2063;
    0x206A, 0x206F;
    0xFEFF, 0xFEFF;
    0xFFF9, 0xFFFC;
    0x1D173, 0x1D17A;
    (* C.3: Private Use *)
    0xE000, 0xF8FF;
    0xF0000, 0xFFFFD;
    0x100000, 0x10FFFD;
    (* C.4: non-characters *)
    0xFDD0, 0xFDEF;
    0xFFFE, 0xFFFF;
    0x1FFFE, 0x1FFFF;
    0x2FFFE, 0x2FFFF;
    0x3FFFE, 0x3FFFF;
    0x4FFFE, 0x4FFFF;
    0x5FFFE, 0x5FFFF;
    0x6FFFE, 0x6FFFF;
    0x7FFFE, 0x7FFFF;
    0x8FFFE, 0x8FFFF;
    0x9FFFE, 0x9FFFF;
    0xAFFFE, 0xAFFFF;
    0xBFFFE, 0xBFFFF;
    0xCFFFE, 0xCFFFF;
    0xDFFFE, 0xDFFFF;
    0xEFFFE, 0xEFFFF;
    0xFFFFE, 0xFFFFF;
    0x10FFFE, 0x10FFFF; 
    (* C.5. surrogate pairs *)
    0xD800, 0xDFFF;
    (* C.6 Inappropriate for plain text *)
    0xFFF9, 0xFFFD;
    (* C.7 Inappropriate for canonical representation *)
    0x2FF0, 0x2FFB;
    (* C.8 Change display properties or are deprecated *)
    0x0340, 0x0341;
    0x200E, 0x200F;
    0x202A, 0x202E;
    0x206A, 0x206F;
    (* C.9 Tagging characters *)
    0xE0001, 0xE0001;
    0xE0020, 0xE007F;
  ]


let forbidden() =
  printf "let forbidden =\n  [| ";
  print_pairs forbidden_tab;
  printf " |]\n\n%!"


let() =
  printf "(* Generated file! *)\n";
  exclusions();
  decompositions();
  map_to_nothing();
  map_to_space();
  forbidden()




  

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml