(*
* <COPYRIGHT>
* Copyright 2002 Joachim Schrod Network and Publication Consultance GmbH, Gerd Stolpmann
*
* <GPL>
* This file is part of WDialog.
*
* WDialog is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* WDialog is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with WDialog; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
* </>
*)
(* $Id: wd_encoding.ml,v 3.6 2004-03-30 22:16:37 stolpmann Exp $
* ----------------------------------------------------------------------
*
*)
let subst rex f s =
let matches = try Pcre.exec_all ~rex s with Not_found -> [||] in
let substitutes =
Array.map (fun m ->
let (pos,pos') = Pcre.get_substring_ofs m 0 in
f pos (Pcre.get_substring m 0)) matches in
let newlen = ref (String.length s) in
Array.iteri
(fun i m ->
let sb = substitutes.(i) in
let p0,p1 = Pcre.get_substring_ofs m 0 in
newlen := !newlen - (p1-p0) + String.length sb
)
matches;
let s' = String.create !newlen in
let k = ref 0 in (* position in s *)
let k' = ref 0 in (* position in s' *)
Array.iteri
(fun i m ->
let sb = substitutes.(i) in
let p0,p1 = Pcre.get_substring_ofs m 0 in
String.blit s !k s' !k' (p0 - !k);
k' := !k' + (p0 - !k);
String.blit sb 0 s' !k' (String.length sb);
k' := !k' + (String.length sb);
k := p1;
)
matches;
String.blit s !k s' !k' (String.length s - !k);
assert(!k' + String.length s - !k = String.length s');
s'
;;
let encode_as_html_re = (Pcre.regexp "[\\<\\>\\&\\\"]");;
let encode_as_html =
subst
encode_as_html_re
(fun pos ->
function
"<" -> "<"
| ">" -> ">"
| "&" -> "&"
| "\"" -> """
| s -> "&#" ^ string_of_int (Char.code s.[0]) ^ ";"
(* This case is actually not used! *)
)
;;
let repeat n s =
(* Repeat the string [s] [n] times *)
let l = String.length s in
let s' = String.create (l * n) in
for k = 0 to n-1 do
String.blit s 0 s' (k*l) l
done;
s'
;;
let encode_as_pre_re = Pcre.regexp "[ \r\n\t]";;
let encode_as_pre =
let last_nl = ref 0 in
subst
encode_as_pre_re
(fun pos ->
function
" " -> " "
| "\r" -> "" (* simply drop *)
| "\n" -> last_nl := pos+1; "<br>"
| "\t" ->
let spaces = 8 - (pos - !last_nl) mod 8 in
last_nl := pos+1;
repeat spaces " "
| _ ->
assert false
)
;;
let encode_as_para_re = Pcre.regexp "\r?\n(\r?\n)+";;
let encode_as_para s =
"<p>" ^
subst
encode_as_para_re
(fun pos _ -> "<p>")
s
;;
let encode_as_js_string_re =
(Pcre.regexp "\\\\|\\'|\\\"|\\<|\\%|\\x00|[\001-\031]|\127");;
let encode_as_js_string =
subst
encode_as_js_string_re
(fun pos s ->
match s with
"\\" -> "\\\\"
| "\"" -> "\\\""
| "'" -> "\\'"
| _ ->
let c = s.[0] in
Printf.sprintf "\\x%02x" (Char.code c)
)
;;
type pair =
Term of string
| Pair of pair * pair
;;
let encode_as_js_longstring ~(enc : Pxp_types.rep_encoding) s =
let utf8_hack = (enc = `Enc_utf8) in
(* For UTF8: avoid that multi-byte characters are broken up *)
let maximum = 80 in (* maximum line length in bytes *)
let rec find_char_boundary k =
if utf8_hack && k < String.length s &&
Char.code s.[k] >= 128 && Char.code s.[k] < 192
then
find_char_boundary (k+1)
(* The byte at position k is not the first byte of a multi-byte
* character. Try the next byte.
*)
else
k
in
let rec divide startpos endpos =
if endpos - startpos <= maximum then begin
let t = encode_as_js_string (String.sub s startpos (endpos-startpos)) in
if (String.length t) > maximum then begin
let m = find_char_boundary ((startpos + endpos) asr 1) in
Pair(divide startpos m, divide m endpos)
end
else
Term t
end
else
let m = find_char_boundary ((startpos + endpos) asr 1) in
Pair(divide startpos m, divide m endpos)
in
let rec total_length =
function
Term t -> String.length t
| Pair(p1,p2) -> total_length p1 + total_length p2 + 4
in
let rec join s' pos =
function
Term t ->
String.blit t 0 s' !pos (String.length t);
pos := !pos + (String.length t)
| Pair(p1,p2) ->
join s' pos p1;
s'.[ !pos ] <- '"';
s'.[ !pos+1 ] <- '+';
s'.[ !pos+2 ] <- '\n';
s'.[ !pos+3 ] <- '"';
pos := !pos + 4;
join s' pos p2;
in
let expr = divide 0 (String.length s) in
let l = total_length expr in
let s' = String.create l in
let pos = ref 0 in
join s' pos expr;
assert (!pos = l);
s'
;;
(* ======================================================================
* History:
*
* $Log: wd_encoding.ml,v $
* Revision 3.6 2004-03-30 22:16:37 stolpmann
* Bugfix in encode_as_pre (TAB expansion was wrong)
*
* Revision 3.5 2003/03/21 12:50:31 stolpmann
* Fix: encode_as_js_longstring can cope with UTF8-encoded strings
*
* Revision 3.4 2003/03/08 17:46:48 stolpmann
* Fixed bug with zero byte in a Pcre pattern
*
* Revision 3.3 2003/02/21 21:21:43 stolpmann
* js encoding: Do not escape characters >= 128 because we
* do not know the character set.
*
* Revision 3.2 2002/02/14 16:15:21 stolpmann
* Added copyright notice.
*
* Revision 3.1 2002/02/12 20:29:18 stolpmann
* Initial release at sourceforge.
*
* Revision 2.2 2002/01/30 15:12:49 gerd
* HTML encoding: no longer generates entities for characters
* >= 128. The advantage is that the function now works for both
* `Enc_iso88591 and `Enc_utf8.
*
* Revision 2.1 2002/01/28 02:12:36 gerd
* Initial revision.
*
*
*)