(* $Id: netencoding.ml 1998 2014-08-24 20:41:09Z gerd $
* ----------------------------------------------------------------------
*
*)
module Base64 = struct
let b64_pattern plus slash =
[| 'A'; 'B'; 'C'; 'D'; 'E'; 'F'; 'G'; 'H'; 'I'; 'J'; 'K'; 'L'; 'M';
'N'; 'O'; 'P'; 'Q'; 'R'; 'S'; 'T'; 'U'; 'V'; 'W'; 'X'; 'Y'; 'Z';
'a'; 'b'; 'c'; 'd'; 'e'; 'f'; 'g'; 'h'; 'i'; 'j'; 'k'; 'l'; 'm';
'n'; 'o'; 'p'; 'q'; 'r'; 's'; 't'; 'u'; 'v'; 'w'; 'x'; 'y'; 'z';
'0'; '1'; '2'; '3'; '4'; '5'; '6'; '7'; '8'; '9'; plus; slash |];;
let rfc_pattern = b64_pattern '+' '/';;
let url_pattern = b64_pattern '-' '/';;
let encode_with_options b64 equal s pos len
linelen first_linelen crlf =
(* encode using "base64".
* 'b64': The encoding table, created by b64_pattern.
* 'equal': The character that should be used instead of '=' in the original
* encoding scheme. Pass '=' to get the original encoding scheme.
* s, pos, len, linelen: See the interface description of encode_substring.
* first_linelen: The length of the first line.
*
* Returns: (s,last_linelen) where [s] is the encoded string, and
* [last_linelen] is the length of the last line
*)
assert (Array.length b64 = 64);
if len < 0 || pos < 0 || pos > String.length s || linelen < 0 then
invalid_arg "Netencoding.Base64.encode";
if pos + len > String.length s then
invalid_arg "Netencoding.Base64.encode";
let linelen = (linelen asr 2) lsl 2 in
let first_linelen = (first_linelen asr 2) lsl 2 in
let l_t = if len = 0 then 0 else ((len - 1) / 3 + 1) * 4 in
(* l_t: length of the result without additional line endings *)
let factor = if crlf then 2 else 1 in
let l_t' =
if linelen < 4 then
l_t
else
if l_t <= first_linelen then
( if l_t = 0 then 0 else l_t + factor )
else
let n_lines = ((l_t - first_linelen - 1) / linelen) + 2 in
l_t + n_lines * factor
in
(* l_t': length of the result with CRLF or LF characters *)
let t = String.make l_t' equal in
let j = ref 0 in
let q = ref (linelen - first_linelen) in
for k = 0 to len / 3 - 1 do
let p = pos + 3*k in
(* p >= pos >= 0: this is evident
* p+2 < pos+len <= String.length s:
* Because k <= len/3-1
* 3*k <= 3*(len/3-1) = len - 3
* pos+3*k+2 <= pos + len - 3 + 2 = pos + len - 1 < pos + len
* So it is proved that the following unsafe string accesses always
* work.
*)
let bits = (Char.code (String.unsafe_get s (p)) lsl 16) lor
(Char.code (String.unsafe_get s (p+1)) lsl 8) lor
(Char.code (String.unsafe_get s (p+2))) in
(* Obviously, 'bits' is a 24 bit entity (i.e. bits < 2**24) *)
assert(!j + 3 < l_t');
String.unsafe_set t !j (Array.unsafe_get b64 ( bits lsr 18));
String.unsafe_set t (!j+1) (Array.unsafe_get b64 ((bits lsr 12) land 63));
String.unsafe_set t (!j+2) (Array.unsafe_get b64 ((bits lsr 6) land 63));
String.unsafe_set t (!j+3) (Array.unsafe_get b64 ( bits land 63));
j := !j + 4;
if linelen > 3 then begin
q := !q + 4;
if !q + 4 > linelen then begin
(* The next 4 characters won't fit on the current line. So insert
* a line ending.
*)
if crlf then begin
t.[ !j ] <- '\013';
t.[ !j+1 ] <- '\010';
j := !j + 2;
end
else begin
t.[ !j ] <- '\010';
incr j
end;
q := 0;
end;
end;
done;
(* padding if needed: *)
let m = len mod 3 in
begin
match m with
0 -> ()
| 1 ->
let bits = Char.code (s.[pos + len - 1]) in
t.[ !j ] <- b64.( bits lsr 2);
t.[ !j + 1 ] <- b64.( (bits land 0x03) lsl 4);
j := !j + 4;
q := !q + 4;
| 2 ->
let bits = (Char.code (s.[pos + len - 2]) lsl 8) lor
(Char.code (s.[pos + len - 1])) in
t.[ !j ] <- b64.( bits lsr 10);
t.[ !j + 1 ] <- b64.((bits lsr 4) land 0x3f);
t.[ !j + 2 ] <- b64.((bits lsl 2) land 0x3f);
j := !j + 4;
q := !q + 4;
| _ -> assert false
end;
(* If required, add another line end: *)
if linelen > 3 && !q > 0 && len > 0 then begin
if crlf then begin
t.[ !j ] <- '\013';
t.[ !j+1 ] <- '\010';
j := !j + 2;
end
else begin
t.[ !j ] <- '\010';
incr j;
end;
end;
(t, !q) ;;
let encode ?(pos=0) ?len ?(linelength=0) ?(crlf=false) s =
let l = match len with None -> String.length s - pos | Some x -> x in
let s,_ =
encode_with_options rfc_pattern '=' s pos l linelength linelength crlf in
s
;;
let url_encode ?(pos=0) ?len ?(linelength=0) ?(crlf=false) s =
let l = match len with None -> String.length s - pos | Some x -> x in
let s,_ =
encode_with_options url_pattern '.' s pos l linelength linelength crlf in
s
;;
let encoding_pipe_conv ?(linelength = 0) ?(crlf = false) lastlen
incoming incoming_eof outgoing =
let linelength = (linelength asr 2) lsl 2 in
let len = Netbuffer.length incoming in
let len' =
if incoming_eof then
len
else
len - (len mod 3) (* only process a multiple of three characters *)
in
let (s,ll) =
encode_with_options
rfc_pattern '=' (Netbuffer.unsafe_buffer incoming) 0 len'
linelength (linelength - !lastlen) crlf
in
Netbuffer.delete incoming 0 len';
(* LF/CRLF: Unless s = "", s ends with a LF/CRLF. This is only right
* if ll = 0 or at EOF. In the other cases, this additional LF/CRLF
* must not be added to [outgoing].
*)
if linelength < 3 || ll=0 || s="" then begin
Netbuffer.add_string outgoing s;
end
else begin
let sl = String.length s in
assert(s.[sl-1] = '\n');
let sl' = if crlf then sl-2 else sl-1 in
Netbuffer.add_sub_string outgoing s 0 sl';
end;
lastlen := ll;
(* Ensure there is a LF/CRLF at the end: *)
if incoming_eof && linelength > 3 && ll > 0 then
Netbuffer.add_string outgoing (if crlf then "\r\n" else "\n");
(* TODO: Can be improved by using Netbuffer.add_inplace
*)
class encoding_pipe ?linelength ?crlf () =
let lastlen = ref 0 in
Netchannels.pipe ~conv:(encoding_pipe_conv ?linelength ?crlf lastlen) ()
let decode_prefix t pos len p_url p_spaces p_full p_null =
(* Decodes the prefix of a Base64-encoded string. Returns a triple
* (s,n,eof) where s is the decoded prefix, and n is the number of
* processed characters from t (i.e. the characters pos to pos+n-1 have
* been processed), and where eof is the boolean flag whether the
* padding '=' characters at the end of the string have been seen.
*
* p_url: accepts strings produced by url_endode
* p_spaces: accepts spaces in [t] (at the price of reduced speed)
* p_full: [t] must be a closed encoded string (i.e. no prefix)
* p_null: [t] must be an encoded null string
*)
if len < 0 || pos < 0 || pos > String.length t then
invalid_arg "Netencoding.Base64.decode";
if pos + len > String.length t then
invalid_arg "Netencoding.Base64.decode";
(* Compute the number of effective characters l_t in 't';
* pad_chars: number of '=' characters at the end of the string.
*)
let l_t, pad_chars =
if p_spaces then begin
(* Count all non-whitespace characters: *)
let c = ref 0 in
let p = ref 0 in
for i = pos to pos + len - 1 do
match String.unsafe_get t i with
(' '|'\t'|'\r'|'\n'|'>') -> ()
| ('='|'.') as ch ->
if ch = '.' && not p_url then
invalid_arg "Netencoding.Base64.decode";
incr c;
incr p;
if !p > 2 then
invalid_arg "Netencoding.Base64.decode";
for j = i+1 to pos + len - 1 do
match String.unsafe_get t j with
(' '|'\t'|'\r'|'\n'|'.'|'=') -> ()
| _ ->
(* Only another '=' or spaces allowed *)
invalid_arg "Netencoding.Base64.decode";
done
| _ -> incr c
done;
!c, !p
end
else
len,
( if len > 0 then (
if String.sub t (len - 2) 2 = "==" ||
(p_url && String.sub t (len - 2) 2 = "..") then 2
else
if String.sub t (len - 1) 1 = "=" ||
(p_url && String.sub t (len - 1) 1 = ".") then 1
else
0
)
else 0
)
in
if p_null && l_t <> 0 then invalid_arg "Netencoding.Base64.decode";
(* Compute the number of characters [l_t] that can be processed now
* (i.e. the effective prefix)
*)
let l_t, pad_chars =
let m = l_t mod 4 in
if m = 0 then (
(l_t, pad_chars) (* a multiple of 4 *)
) else (
if p_full then invalid_arg "Netencoding.Base64.decode";
(l_t - m, 0) (* rounded to a multiple of 4 *)
)
in
let l_s = (l_t / 4) * 3 - pad_chars in
let s = String.create l_s in
let decode_char c =
match c with
'A' .. 'Z' -> Char.code(c) - 65 (* 65 = Char.code 'A' *)
| 'a' .. 'z' -> Char.code(c) - 71 (* 71 = Char.code 'a' - 26 *)
| '0' .. '9' -> Char.code(c) + 4 (* -4 = Char.code '0' - 52 *)
| '+' -> 62
| '-' -> if not p_url then
invalid_arg "Netencoding.Base64.decode";
62
| '/' -> 63
| _ -> invalid_arg "Netencoding.Base64.decode";
in
(* Decode all but the last quartet: *)
let cursor = ref pos in
let rec next_char() =
match t.[ !cursor ] with
(' '|'\t'|'\r'|'\n'|'>') ->
if p_spaces then (incr cursor; next_char())
else invalid_arg "Netencoding.Base64.decode"
| c ->
incr cursor; c
in
if p_spaces then begin
for k = 0 to l_t / 4 - 2 do
let q = 3*k in
let c0 = next_char() in
let c1 = next_char() in
let c2 = next_char() in
let c3 = next_char() in
let n0 = decode_char c0 in
let n1 = decode_char c1 in
let n2 = decode_char c2 in
let n3 = decode_char c3 in
let x0 = (n0 lsl 2) lor (n1 lsr 4) in
let x1 = ((n1 lsl 4) land 0xf0) lor (n2 lsr 2) in
let x2 = ((n2 lsl 6) land 0xc0) lor n3 in
String.unsafe_set s q (Char.chr x0);
String.unsafe_set s (q+1) (Char.chr x1);
String.unsafe_set s (q+2) (Char.chr x2);
done;
end
else begin
(* Much faster: *)
for k = 0 to l_t / 4 - 2 do
let p = pos + 4*k in
let q = 3*k in
let c0 = String.unsafe_get t p in
let c1 = String.unsafe_get t (p + 1) in
let c2 = String.unsafe_get t (p + 2) in
let c3 = String.unsafe_get t (p + 3) in
let n0 = decode_char c0 in
let n1 = decode_char c1 in
let n2 = decode_char c2 in
let n3 = decode_char c3 in
let x0 = (n0 lsl 2) lor (n1 lsr 4) in
let x1 = ((n1 lsl 4) land 0xf0) lor (n2 lsr 2) in
let x2 = ((n2 lsl 6) land 0xc0) lor n3 in
String.unsafe_set s q (Char.chr x0);
String.unsafe_set s (q+1) (Char.chr x1);
String.unsafe_set s (q+2) (Char.chr x2);
done;
cursor := pos + l_t - 4;
end;
(* Decode the last quartet: *)
if l_t > 0 then begin
let q = 3*(l_t / 4 - 1) in
let c0 = next_char() in
let c1 = next_char() in
let c2 = next_char() in
let c3 = next_char() in
if (c2 = '=' && c3 = '=') || (p_url && c2 = '.' && c3 = '.') then begin
let n0 = decode_char c0 in
let n1 = decode_char c1 in
let x0 = (n0 lsl 2) lor (n1 lsr 4) in
s.[ q ] <- Char.chr x0;
end
else
if (c3 = '=') || (p_url && c3 = '.') then begin
let n0 = decode_char c0 in
let n1 = decode_char c1 in
let n2 = decode_char c2 in
let x0 = (n0 lsl 2) lor (n1 lsr 4) in
let x1 = ((n1 lsl 4) land 0xf0) lor (n2 lsr 2) in
s.[ q ] <- Char.chr x0;
s.[ q+1 ] <- Char.chr x1;
end
else begin
let n0 = decode_char c0 in
let n1 = decode_char c1 in
let n2 = decode_char c2 in
let n3 = decode_char c3 in
let x0 = (n0 lsl 2) lor (n1 lsr 4) in
let x1 = ((n1 lsl 4) land 0xf0) lor (n2 lsr 2) in
let x2 = ((n2 lsl 6) land 0xc0) lor n3 in
s.[ q ] <- Char.chr x0;
s.[ q+1 ] <- Char.chr x1;
s.[ q+2 ] <- Char.chr x2;
end
end
else cursor := 0;
(s, !cursor - pos, pad_chars > 0)
;;
let decode ?(pos=0) ?len ?(url_variant=true) ?(accept_spaces=false) s =
let l = match len with None -> String.length s - pos | Some x -> x in
let (s,_,_) = decode_prefix s pos l url_variant accept_spaces true false in
s
;;
(* TODO: Use Netbuffer.add_inplace instead of creating an intermediate
* string s in [decoding_pipe_conv].
*)
let decoding_pipe_conv url_variant accept_spaces padding_seen
incoming incoming_eof outgoing =
let len = Netbuffer.length incoming in
let t = Netbuffer.unsafe_buffer incoming in
if !padding_seen then begin
(* Only accept the null string: *)
let _,_,_ = decode_prefix t 0 len url_variant accept_spaces false true in
Netbuffer.clear incoming
end
else begin
let (s,n,ps) =
decode_prefix t 0 len url_variant accept_spaces incoming_eof false in
padding_seen := ps;
if incoming_eof then
Netbuffer.clear incoming
else
Netbuffer.delete incoming 0 n;
Netbuffer.add_string outgoing s
end;
class decoding_pipe ?(url_variant=true) ?(accept_spaces=false) () =
let padding_seen = ref false in
Netchannels.pipe
~conv:(decoding_pipe_conv url_variant accept_spaces padding_seen) ()
module Deprecated = struct
let encode_substring s ~pos ~len ~linelength ~crlf =
let s,_ =
encode_with_options
rfc_pattern '=' s pos len linelength linelength crlf in
s ;;
let decode_ignore_spaces s =
let (s,_,_) = decode_prefix s 0 (String.length s) true true true false in
s ;;
let decode_substring s ~pos ~len ~url_variant ~accept_spaces =
let (s,_,_) = decode_prefix s pos len url_variant accept_spaces true false
in
s ;;
end
end
module QuotedPrintable = struct
let encode_substring ?(crlf = true) ?(eot = false) ?(line_length = ref 0) s ~pos ~len =
(* line_length:
* - on input, the length of the line where the encoding starts
* - on output, the length of the last written line
* eot:
* - false: it is known that the chunk is not at the end of text
* - true: the chunk may be at the end of the text
* eot has only an effect on trailing spaces
*)
if len < 0 or pos < 0 or pos > String.length s then
invalid_arg "Netencoding.QuotedPrintable.encode";
if pos + len > String.length s then
invalid_arg "Netencoding.QuotedPrintable.encode";
let eol_len = if crlf then 2 else 1 in (* length of eol *)
(* Note: The [count] algorithm must strictly correspond to the
* "for" loop below.
*)
let rec count l n i =
(* l: output line length
* n: output byte count
* i: input byte count
*)
if i < len then
match String.unsafe_get s (pos+i) with
'\r' -> (* CR is deleted *)
count l n (i+1)
| '\n' -> (* LF may be expanded to CR/LF *)
count 0 (n+eol_len) (i+1)
| ('\000'..'\031'|'\127'..'\255'|
'!'|'"'|'#'|'$'|'@'|'['|']'|'^'|'\''|'{'|'|'|'}'|'~'|'=') ->
if l <= 69 then
count (l+3) (n+3) (i+1)
else
(* Add soft line break after the encoded char: *)
count 0 (n+4+eol_len) (i+1)
| 'F' when l=0 ->
(* Protect 'F' at the beginning of lines *)
count (l+3) (n+3) (i+1)
| ' ' when (i=len-1 && eot) || (* at end of text *)
l>69 || (* line too long *)
(i<len-1 && (s.[pos+i+1]='\r' || s.[pos+i+1]='\n'))
(* end of line *)
->
(* Protect spaces only if they occur at the end of a line,
* or just before soft line breaks
*)
if l <= 69 then
count (l+3) (n+3) (i+1)
else
(* Add soft line after the encoded space: *)
count 0 (n+4+eol_len) (i+1)
| _ ->
if l>71 then
(* Add soft line break after the char: *)
count 0 (n+2+eol_len) (i+1)
else
count (l+1) (n+1) (i+1)
else
n
in
let t_len = count !line_length 0 0 in
let t = String.create t_len in
let hexdigit =
[| '0'; '1'; '2'; '3'; '4'; '5'; '6'; '7';
'8'; '9'; 'A'; 'B'; 'C'; 'D'; 'E'; 'F'; |] in
let k = ref 0 in
let add_quoted c =
t.[ !k ] <- '=';
t.[ !k+1 ] <- hexdigit.( Char.code c lsr 4 );
t.[ !k+2 ] <- hexdigit.( Char.code c land 15 )
in
let add_soft_break() =
t.[ !k ] <- '=';
if crlf then (
t.[ !k+1 ] <- '\r';
t.[ !k+2 ] <- '\n';
)
else
t.[ !k+1 ] <- '\n';
in
(* In the following, the soft break criterion is [!l > 72]. Why?
* We need to be able to add at least an encoded char (3 bytes)
* plus the "=" sign for the soft break. So we are on the safe side
* when there are four bytes space on the line. Lines must not be
* longer than 76 chars (w/o CRLF), so 76-4=72.
*)
let l = ref !line_length in
for i = 0 to len - 1 do
match String.unsafe_get s i with
'\r' -> (* CR is deleted *)
()
| '\n' -> (* LF is expanded to CR/LF *)
if crlf then (
t.[ !k ] <- '\r';
t.[ !k+1 ] <- '\n';
k := !k + 2;
) else (
t.[ !k ] <- '\n';
k := !k + 1;
);
l := 0
| ('\000'..'\031'|'\127'..'\255'|
'!'|'"'|'#'|'$'|'@'|'['|']'|'^'|'\''|'{'|'|'|'}'|'~'|'=') as c ->
add_quoted c;
k := !k + 3;
l := !l + 3;
if !l > 72 then (
(* Add soft line break: *)
add_soft_break();
k := !k + 1 + eol_len;
l := 0
)
| 'F' when !l = 0 ->
(* Protect 'F' at the beginning of lines *)
add_quoted 'F';
k := !k + 3;
l := !l + 3;
| ' ' when ((i=len-1 && eot) ||
!l > 69 ||
(i<len-1 && (s.[pos+i+1]='\r' || s.[pos+i+1]='\n'))) ->
add_quoted ' ';
k := !k + 3;
l := !l + 3;
if !l > 72 then (
add_soft_break();
k := !k + 1 + eol_len;
l := 0;
)
| c ->
String.unsafe_set t !k c;
incr k;
incr l;
if !l > 72 then (
add_soft_break();
k := !k + 1 + eol_len;
l := 0;
)
done;
assert(!k == t_len);
line_length := !l;
t ;;
let encode ?crlf ?(pos=0) ?len s =
let l = match len with None -> String.length s - pos | Some x -> x in
encode_substring ?crlf ~eot:true s ~pos ~len:l;;
let encoding_pipe_conv ?crlf line_length incoming incoming_eof outgoing =
(* Problematic case: the incoming buffer ends with a space, but we are
* not at EOF. It is possible that a LF immediately follows, and that
* the space needs to be quoted.
* Solution: Do not convert such spaces, they remain in the buffer.
*)
let s = Netbuffer.unsafe_buffer incoming in
let len = Netbuffer.length incoming in
let (len',eot) =
if not incoming_eof && len > 0 && s.[len-1] = ' ' then
(len-1, false)
else
(len, true)
in
let s' = encode_substring ?crlf ~eot ~line_length s ~pos:0 ~len:len' in
Netbuffer.add_string outgoing s';
Netbuffer.delete incoming 0 len'
;;
class encoding_pipe ?crlf () =
let line_length = ref 0 in
Netchannels.pipe ~conv:(encoding_pipe_conv ?crlf line_length) ()
let decode_substring s ~pos ~len =
if len < 0 || pos < 0 || pos > String.length s then
invalid_arg "Netencoding.QuotedPrintable.decode";
if pos + len > String.length s then
invalid_arg "Netencoding.QuotedPrintable.decode";
let decode_hex c =
match c with
'0'..'9' -> Char.code c - 48
| 'A'..'F' -> Char.code c - 55
| 'a'..'f' -> Char.code c - 87
| _ ->
invalid_arg "Netencoding.QuotedPrintable.decode";
in
let rec count n i =
if i < len then
match String.unsafe_get s (pos+i) with
'=' ->
if i+1 = len then
(* A '=' at EOF is ignored *)
count n (i+1)
else
if i+1 < len then
match s.[pos+i+1] with
'\r' ->
(* Official soft break *)
if i+2 < len && s.[pos+i+2] = '\n' then
count n (i+3)
else
count n (i+2)
| '\n' ->
(* Inofficial soft break *)
count n (i+2)
| _ ->
if i+2 >= len then
invalid_arg
"Netencoding.QuotedPrintable.decode";
let _ = decode_hex s.[pos+i+1] in
let _ = decode_hex s.[pos+i+2] in
count (n+1) (i+3)
else
invalid_arg "Netencoding.QuotedPrintable.decode"
| _ ->
count (n+1) (i+1)
else
n
in
let l = count 0 0 in
let t = String.create l in
let k = ref pos in
let e = pos + len in
let i = ref 0 in
while !i < l do
match String.unsafe_get s !k with
'=' ->
if !k+1 = e then
(* A '=' at EOF is ignored *)
()
else
if !k+1 < e then
match s.[!k+1] with
'\r' ->
(* Official soft break *)
if !k+2 < e & s.[!k+2] = '\n' then
k := !k + 3
else
k := !k + 2
| '\n' ->
(* Inofficial soft break *)
k := !k + 2
| _ ->
if !k+2 >= e then
invalid_arg
"Netencoding.QuotedPrintable.decode_substring";
let x1 = decode_hex s.[!k+1] in
let x2 = decode_hex s.[!k+2] in
t.[ !i ] <- Char.chr ((x1 lsl 4) lor x2);
k := !k + 3;
incr i
else
invalid_arg "Netencoding.QuotedPrintable.decode_substring"
| c ->
String.unsafe_set t !i c;
incr k;
incr i
done;
t ;;
let decode ?(pos=0) ?len s =
let l = match len with None -> String.length s - pos | Some x -> x in
decode_substring s pos l;;
let decoding_pipe_conv incoming incoming_eof outgoing =
(* Problematic case: The incoming buffer ends with '=' or '=X'. In this
* case these characters remain in the buffer, because they will be
* completed to a full hex sequence by the next conversion call.
*)
let s = Netbuffer.unsafe_buffer incoming in
let len = Netbuffer.length incoming in
let len' =
if not incoming_eof then begin
if len > 0 && s.[len-1] = '=' then
len - 1
else
if len > 1 && s.[len-2] = '=' then
len - 2
else
len
end
else
len
in
let s' = decode ~len:len' s in
Netbuffer.add_string outgoing s';
Netbuffer.delete incoming 0 len'
;;
class decoding_pipe () =
Netchannels.pipe ~conv:decoding_pipe_conv ()
module Deprecated = struct
let encode_substring = encode_substring ?line_length:None
let decode_substring = decode_substring
end
end
module Q = struct
let encode_substring s ~pos ~len =
if len < 0 || pos < 0 || pos > String.length s then
invalid_arg "Netencoding.Q.encode_substring";
if pos + len > String.length s then
invalid_arg "Netencoding.Q.encode_substring";
let rec count n i =
if i < len then
match String.unsafe_get s (pos+i) with
| ('A'..'Z'|'a'..'z'|'0'..'9') ->
count (n+1) (i+1)
| _ ->
count (n+3) (i+1)
else
n
in
let l = count 0 0 in
let t = String.create l in
let hexdigit =
[| '0'; '1'; '2'; '3'; '4'; '5'; '6'; '7';
'8'; '9'; 'A'; 'B'; 'C'; 'D'; 'E'; 'F'; |] in
let k = ref 0 in
let add_quoted c =
t.[ !k ] <- '=';
t.[ !k+1 ] <- hexdigit.( Char.code c lsr 4 );
t.[ !k+2 ] <- hexdigit.( Char.code c land 15 )
in
for i = 0 to len - 1 do
match String.unsafe_get s i with
| ('A'..'Z'|'a'..'z'|'0'..'9') as c ->
String.unsafe_set t !k c;
incr k
| c ->
add_quoted c;
k := !k + 3
done;
t ;;
let encode ?(pos=0) ?len s =
let l = match len with None -> String.length s - pos | Some x -> x in
encode_substring s pos l;;
let decode_substring s ~pos ~len =
if len < 0 || pos < 0 || pos > String.length s then
invalid_arg "Netencoding.Q.decode_substring";
if pos + len > String.length s then
invalid_arg "Netencoding.Q.decode_substring";
let decode_hex c =
match c with
'0'..'9' -> Char.code c - 48
| 'A'..'F' -> Char.code c - 55
| 'a'..'f' -> Char.code c - 87
| _ ->
invalid_arg "Netencoding.Q.decode_substring";
in
let rec count n i =
if i < len then
match String.unsafe_get s (pos+i) with
'=' ->
if i+2 >= len then
invalid_arg "Netencoding.Q.decode_substring";
let _ = decode_hex s.[pos+i+1] in
let _ = decode_hex s.[pos+i+2] in
count (n+1) (i+3)
| _ -> (* including '_' *)
count (n+1) (i+1)
else
n
in
let l = count 0 0 in
let t = String.create l in
let k = ref pos in
let e = pos + len in
let i = ref 0 in
while !i < l do
match String.unsafe_get s !k with
'=' ->
if !k+2 >= e then
invalid_arg "Netencoding.Q.decode_substring";
let x1 = decode_hex s.[!k+1] in
let x2 = decode_hex s.[!k+2] in
t.[ !i ] <- Char.chr ((x1 lsl 4) lor x2);
k := !k + 3;
incr i
| '_' ->
String.unsafe_set t !i ' ';
incr k;
incr i
| c ->
String.unsafe_set t !i c;
incr k;
incr i
done;
t ;;
let decode ?(pos=0) ?len s =
let l = match len with None -> String.length s - pos | Some x -> x in
decode_substring s pos l ;;
module Deprecated = struct
let encode_substring = encode_substring
let decode_substring = decode_substring
end
end
module Url = struct
let hex_digits =
[| '0'; '1'; '2'; '3'; '4'; '5'; '6'; '7';
'8'; '9'; 'A'; 'B'; 'C'; 'D'; 'E'; 'F' |];;
let to_hex2 k =
(* Converts k to a 2-digit hex string *)
let s = String.create 2 in
s.[0] <- hex_digits.( (k lsr 4) land 15 );
s.[1] <- hex_digits.( k land 15 );
s ;;
let of_hex1 c =
match c with
('0'..'9') -> Char.code c - Char.code '0'
| ('A'..'F') -> Char.code c - Char.code 'A' + 10
| ('a'..'f') -> Char.code c - Char.code 'a' + 10
| _ ->
raise Not_found ;;
let url_encoding_re =
Netstring_str.regexp "[^A-Za-z0-9_.!*-]";;
let url_decoding_re =
Netstring_str.regexp "\\+\\|%..\\|%.\\|%";;
let encode ?(plus = true) s =
Netstring_str.global_substitute
url_encoding_re
(fun r _ ->
match Netstring_str.matched_string r s with
" " when plus -> "+"
| x ->
let k = Char.code(x.[0]) in
"%" ^ to_hex2 k
)
s ;;
let decode ?(plus = true) ?(pos=0) ?len s =
let s_l = String.length s in
let s1 =
if pos = 0 && len=None then s else
let len = match len with Some n -> n | None -> s_l in
String.sub s pos len in
let l = String.length s1 in
Netstring_str.global_substitute
url_decoding_re
(fun r _ ->
match Netstring_str.matched_string r s1 with
| "+" -> if plus then " " else "+"
| _ ->
let i = Netstring_str.match_beginning r in
(* Assertion: s1.[i] = '%' *)
if i+2 >= l then failwith "Netencoding.Url.decode";
let c1 = s1.[i+1] in
let c2 = s1.[i+2] in
begin
try
let k1 = of_hex1 c1 in
let k2 = of_hex1 c2 in
String.make 1 (Char.chr((k1 lsl 4) lor k2))
with
Not_found ->
failwith "Netencoding.Url.decode"
end
)
s1 ;;
let url_split_re =
Netstring_str.regexp "[&=]";;
let mk_url_encoded_parameters nv_pairs =
String.concat "&"
(List.map
(fun (name,value) ->
let name_encoded = encode name in
let value_encoded = encode value in
name_encoded ^ "=" ^ value_encoded
)
nv_pairs
)
;;
let dest_url_encoded_parameters parstr =
let rec parse_after_amp tl =
match tl with
Netstring_str.Text name :: Netstring_str.Delim "=" :: Netstring_str.Text value :: tl' ->
(decode name, decode value) :: parse_next tl'
| Netstring_str.Text name :: Netstring_str.Delim "=" :: Netstring_str.Delim "&" :: tl' ->
(decode name, "") :: parse_after_amp tl'
| Netstring_str.Text name :: Netstring_str.Delim "=" :: [] ->
[decode name, ""]
| _ ->
failwith "Netencoding.Url.dest_url_encoded_parameters"
and parse_next tl =
match tl with
[] -> []
| Netstring_str.Delim "&" :: tl' ->
parse_after_amp tl'
| _ ->
failwith "Netencoding.Url.dest_url_encoded_parameters"
in
let toklist = Netstring_str.full_split url_split_re parstr in
match toklist with
[] -> []
| _ -> parse_after_amp toklist
;;
let mk_url_encoded_parameters params =
String.concat "&"
(List.map (fun (name, value) -> encode name ^ "=" ^ encode value) params)
end
module Html = struct
let etable =
[ "lt", 60;
"gt", 62;
"amp", 38;
"quot", 34;
(* Note: " is new in HTML-4.0, but it has been widely used
* much earlier.
*)
"apos", 39;
(* Only used if contained in unsafe_chars *)
(* ISO-8859-1: *)
"nbsp", 160;
"iexcl", 161;
"cent", 162;
"pound", 163;
"curren", 164;
"yen", 165;
"brvbar", 166;
"sect", 167;
"uml", 168;
"copy", 169;
"ordf", 170;
"laquo", 171;
"not", 172;
"shy", 173;
"reg", 174;
"macr", 175;
"deg", 176;
"plusmn", 177;
"sup2", 178;
"sup3", 179;
"acute", 180;
"micro", 181;
"para", 182;
"middot", 183;
"cedil", 184;
"sup1", 185;
"ordm", 186;
"raquo", 187;
"frac14", 188;
"frac12", 189;
"frac34", 190;
"iquest", 191;
"Agrave", 192;
"Aacute", 193;
"Acirc", 194;
"Atilde", 195;
"Auml", 196;
"Aring", 197;
"AElig", 198;
"Ccedil", 199;
"Egrave", 200;
"Eacute", 201;
"Ecirc", 202;
"Euml", 203;
"Igrave", 204;
"Iacute", 205;
"Icirc", 206;
"Iuml", 207;
"ETH", 208;
"Ntilde", 209;
"Ograve", 210;
"Oacute", 211;
"Ocirc", 212;
"Otilde", 213;
"Ouml", 214;
"times", 215;
"Oslash", 216;
"Ugrave", 217;
"Uacute", 218;
"Ucirc", 219;
"Uuml", 220;
"Yacute", 221;
"THORN", 222;
"szlig", 223;
"agrave", 224;
"aacute", 225;
"acirc", 226;
"atilde", 227;
"auml", 228;
"aring", 229;
"aelig", 230;
"ccedil", 231;
"egrave", 232;
"eacute", 233;
"ecirc", 234;
"euml", 235;
"igrave", 236;
"iacute", 237;
"icirc", 238;
"iuml", 239;
"eth", 240;
"ntilde", 241;
"ograve", 242;
"oacute", 243;
"ocirc", 244;
"otilde", 245;
"ouml", 246;
"divide", 247;
"oslash", 248;
"ugrave", 249;
"uacute", 250;
"ucirc", 251;
"uuml", 252;
"yacute", 253;
"thorn", 254;
"yuml", 255;
(* Other: *)
"fnof", 402;
"Alpha", 913;
"Beta", 914;
"Gamma", 915;
"Delta", 916;
"Epsilon", 917;
"Zeta", 918;
"Eta", 919;
"Theta", 920;
"Iota", 921;
"Kappa", 922;
"Lambda", 923;
"Mu", 924;
"Nu", 925;
"Xi", 926;
"Omicron", 927;
"Pi", 928;
"Rho", 929;
"Sigma", 931;
"Tau", 932;
"Upsilon", 933;
"Phi", 934;
"Chi", 935;
"Psi", 936;
"Omega", 937;
"alpha", 945;
"beta", 946;
"gamma", 947;
"delta", 948;
"epsilon", 949;
"zeta", 950;
"eta", 951;
"theta", 952;
"iota", 953;
"kappa", 954;
"lambda", 955;
"mu", 956;
"nu", 957;
"xi", 958;
"omicron", 959;
"pi", 960;
"rho", 961;
"sigmaf", 962;
"sigma", 963;
"tau", 964;
"upsilon", 965;
"phi", 966;
"chi", 967;
"psi", 968;
"omega", 969;
"thetasym", 977;
"upsih", 978;
"piv", 982;
"bull", 8226;
"hellip", 8230;
"prime", 8242;
"Prime", 8243;
"oline", 8254;
"frasl", 8260;
"weierp", 8472;
"image", 8465;
"real", 8476;
"trade", 8482;
"alefsym", 8501;
"larr", 8592;
"uarr", 8593;
"rarr", 8594;
"darr", 8595;
"harr", 8596;
"crarr", 8629;
"lArr", 8656;
"uArr", 8657;
"rArr", 8658;
"dArr", 8659;
"hArr", 8660;
"forall", 8704;
"part", 8706;
"exist", 8707;
"empty", 8709;
"nabla", 8711;
"isin", 8712;
"notin", 8713;
"ni", 8715;
"prod", 8719;
"sum", 8721;
"minus", 8722;
"lowast", 8727;
"radic", 8730;
"prop", 8733;
"infin", 8734;
"ang", 8736;
"and", 8743;
"or", 8744;
"cap", 8745;
"cup", 8746;
"int", 8747;
"there4", 8756;
"sim", 8764;
"cong", 8773;
"asymp", 8776;
"ne", 8800;
"equiv", 8801;
"le", 8804;
"ge", 8805;
"sub", 8834;
"sup", 8835;
"nsub", 8836;
"sube", 8838;
"supe", 8839;
"oplus", 8853;
"otimes", 8855;
"perp", 8869;
"sdot", 8901;
"lceil", 8968;
"rceil", 8969;
"lfloor", 8970;
"rfloor", 8971;
"lang", 9001;
"rang", 9002;
"loz", 9674;
"spades", 9824;
"clubs", 9827;
"hearts", 9829;
"diams", 9830;
"OElig", 338;
"oelig", 339;
"Scaron", 352;
"scaron", 353;
"Yuml", 376;
"circ", 710;
"tilde", 732;
"ensp", 8194;
"emsp", 8195;
"thinsp", 8201;
"zwnj", 8204;
"zwj", 8205;
"lrm", 8206;
"rlm", 8207;
"ndash", 8211;
"mdash", 8212;
"lsquo", 8216;
"rsquo", 8217;
"sbquo", 8218;
"ldquo", 8220;
"rdquo", 8221;
"bdquo", 8222;
"dagger", 8224;
"Dagger", 8225;
"permil", 8240;
"lsaquo", 8249;
"rsaquo", 8250;
"euro", 8364;
] ;;
let quick_etable_html =
let ht = Hashtbl.create 50 in
List.iter (fun (name,value) ->
Hashtbl.add ht name value
)
etable;
ht ;;
let quick_etable_xml =
let ht = Hashtbl.create 5 in
List.iter (fun name ->
let value = List.assoc name etable in
Hashtbl.add ht name value
)
[ "lt"; "gt"; "amp"; "quot"; "apos"];
ht ;;
let rev_etable =
(* Only code points 0 to 255: *)
let a = Array.make 256 "" in
List.iter (fun (name,value) ->
if value <= 255 then
a.(value) <- "&" ^ name ^ ";"
) etable;
a ;;
let rev_etable_rest =
(* Only code points >= 256: *)
let ht = Hashtbl.create 150 in
List.iter (fun (name,value) ->
if value >= 256 then
Hashtbl.add ht value ("&" ^ name ^ ";")
) etable;
ht ;;
let unsafe_chars_html4 = "<>\"&\000\001\002\003\004\005\006\007\008\011\012\014\015\016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031\127" ;;
let regexp_ht = Hashtbl.create 7
let regexp_ht_mutex = !Netsys_oothr.provider # create_mutex()
let regexp_set s =
Netsys_oothr.serialize
regexp_ht_mutex
(fun () ->
try
Hashtbl.find regexp_ht s
with
| Not_found ->
let re = Netstring_str.regexp (Netstring_str.quote_set s) in
if Hashtbl.length regexp_ht < 100 then (* avoid leak *)
Hashtbl.replace regexp_ht s re;
re
)
()
(* The functions [encode_quickly] and [encode_ascii] are special cases of
* [encode] that can be implemented by regular expressions.
*)
let encode_quickly ~prefer_name ~unsafe_chars () =
(* Preconditions: in_enc = out_enc, and the encoding must be a single-byte,
* ASCII-compatible encoding.
*)
if unsafe_chars = "" then
(fun s -> s)
else
let unsafe_re =
regexp_set unsafe_chars in
Netstring_str.global_substitute
unsafe_re
(fun r s ->
let t = Netstring_str.matched_string r s in
let p = Char.code (t.[0]) in (* p is an ASCII code point *)
let name = rev_etable.(p) in
if prefer_name && name <> "" then
name
else
"&#" ^ string_of_int p ^ ";"
)
;;
let msb_set = (
let s = String.create 128 in
for k = 0 to 127 do s.[k] <- Char.chr (128+k) done;
s
)
let encode_ascii ~in_enc ~prefer_name ~unsafe_chars () =
(* Preconditions: out_enc = `Enc_usascii, and in_enc must be a single-byte,
* ASCII-compatible encoding.
*)
let unsafe_chars1 = unsafe_chars ^ msb_set in
let unsafe_re =
regexp_set unsafe_chars1 in
(* unicode_of.[q] = p: the code point q+128 of in_enc is the same as the
* Unicode code point p
*)
let unicode_of = Array.make 128 (-1) in
for i = 0 to 127 do
try
let s = String.make 1 (Char.chr (i+128)) in
let u = Netconversion.uarray_of_ustring in_enc s in
match u with
[| u0 |] -> unicode_of.(i) <- u0
| _ -> assert false
with
Netconversion.Malformed_code ->
unicode_of.(i) <- (-1)
done;
Netstring_str.global_substitute
unsafe_re
(fun r s ->
let t = Netstring_str.matched_string r s in
(* p is the code point in the encoding ~in_enc; p' is the Unicode
* code point:
*)
let p = Char.code (t.[0]) in
let p' = if p < 128 then p else unicode_of.(p - 128) in
if p' < 0 then raise Netconversion.Malformed_code;
let name =
if prefer_name then begin
if p' <= 255 then rev_etable.(p') else
try
Hashtbl.find rev_etable_rest p'
with
Not_found -> ""
end
else "" in
if name = "" then
"&#" ^ string_of_int p' ^ ";"
else
name
)
;;
let encode_from_latin1 = (* backwards compatible *)
encode_ascii
~in_enc:`Enc_iso88591 ~prefer_name:true ~unsafe_chars:unsafe_chars_html4
()
;;
let encode
~in_enc
?(out_enc = `Enc_usascii)
?(prefer_name = true)
?(unsafe_chars = unsafe_chars_html4)
() =
(* This function implements the general case *)
(* Check arguments: *)
if not (Netconversion.is_ascii_compatible out_enc) then
invalid_arg "Netencoding.Html.encode: out_enc not ASCII-compatible";
for i = 0 to String.length unsafe_chars - 1 do
if Char.code(unsafe_chars.[i]) >= 128 then
invalid_arg "Netencoding.Html.encode: non-ASCII character in unsafe_chars";
done;
(* Are there better implementations than the general one? *)
let in_single = Netconversion.is_single_byte in_enc in
let in_subset = match in_enc with `Enc_subset(_,_) -> true | _ -> false in
if not in_subset && in_enc=out_enc && in_single then
encode_quickly ~prefer_name ~unsafe_chars ()
else if not in_subset && out_enc=`Enc_usascii && in_single then
encode_ascii ~in_enc ~prefer_name ~unsafe_chars ()
else begin
(* ... only the general implementation is applicable. *)
(* Create the domain function: *)
let dom_array = Array.make 128 true in
let dom p = p >= 128 || dom_array.(p) in
(* Set dom_array from unsafe_chars: *)
for i = 0 to String.length unsafe_chars - 1 do
let c = Char.code(unsafe_chars.[i]) in
dom_array.(c) <- false
done;
(* Create the substitution function: *)
let subst p =
let name =
if prefer_name then begin
if p <= 255 then rev_etable.(p) else
try
Hashtbl.find rev_etable_rest p
with
Not_found -> ""
end
else "" in
if name = "" then
"&#" ^ string_of_int p ^ ";"
else
name
in
(* Recode: *)
Netconversion.recode_string
~in_enc ~out_enc:(`Enc_subset(out_enc,dom)) ~subst
end
;;
type entity_set = [ `Html | `Xml | `Empty ];;
let eref_re =
Netstring_str.regexp "&\\(\
#\\([0-9]+\\);\\|\
#[xX]\\([0-9a-fA-F]+\\);\\|\
\\([a-zA-Z]+\\);\
\\)" ;;
let total_enc =
(* every byte must have a corresponding Unicode code point, i.e. the
* encoding must be "byte-total"
*)
function
`Enc_iso88591
| `Enc_iso88592
| `Enc_iso88593
| `Enc_iso88594
| `Enc_iso88595
| `Enc_iso88599
| `Enc_iso885910
| `Enc_iso885913
| `Enc_iso885914
| `Enc_iso885915
| `Enc_iso885916 -> true
| _ -> false
;;
let hex_digit_of_char c =
match c with
'0'..'9' -> Char.code c - 48
| 'A'..'F' -> Char.code c - 55
| 'a'..'f' -> Char.code c - 87
| _ -> assert false
let hex_of_string s =
let n = ref 0 in
for i = 0 to String.length s - 1 do
let d = hex_digit_of_char s.[i] in
n := (!n lsl 4) lor d
done;
!n
let search_all re s pos =
let rec search p acc =
match
try Some(Netstring_str.search_forward re s p) with Not_found -> None
with
| Some (k,r) ->
search (k+1) ( (k,r) :: acc )
| None ->
List.rev acc in
search pos []
let decode
~in_enc
~out_enc
?(lookup=fun name ->
failwith ("Netencoding.Html.decode: Unknown entity `" ^ name ^ "'"))
?(subst=fun p ->
failwith ("Netencoding.Html.decode: Character cannot be represented: " ^ string_of_int p))
?(entity_base = (`Html : entity_set))
() =
(* Argument checks: *)
if not (Netconversion.is_ascii_compatible in_enc) then
invalid_arg "Netencoding.Html.decode: in_enc not ASCII-compatible";
(* makechar: *)
let raw_makechar = Netconversion.makechar out_enc in
let makechar p =
try raw_makechar p
with Not_found -> subst p
in
(* Entity lookup: *)
let lookup_entity =
match entity_base with
`Html
| `Xml ->
let ht =
if entity_base = `Html
then quick_etable_html
else quick_etable_xml in
( fun name ->
try
makechar(Hashtbl.find ht name)
with
Not_found -> lookup name
)
| `Empty ->
lookup
in
(* Recode strings: *)
let recode_str =
if total_enc in_enc && in_enc = out_enc then
(fun s -> s)
else
Netconversion.recode_string ~in_enc ~out_enc ~subst
in
(fun s ->
(* Find all occurrences of &name; or &#num; or &#xnum; *)
let occurrences = search_all eref_re s 0 in
(* Collect the resulting string in a buffer *)
let buf = Buffer.create 250 in
let n = ref 0 in
List.iter
(fun (n0,r) ->
let n1 = Netstring_str.match_end r in
if n0 > !n then
Buffer.add_string buf (recode_str (String.sub s !n (n0 - !n)));
(* TODO: avoid String.sub *)
let replacement =
let num =
try Netstring_str.matched_group r 2 s with Not_found -> "" in
(* Note: Older versions of Pcre return "" when the substring
* did not match, newer versions raise Not_found
*)
if num <> "" then begin
let n = int_of_string num in
makechar n
end
else begin
let xnum =
try Netstring_str.matched_group r 3 s with Not_found -> "" in
(* Note: Older versions of Pcre return "" when the substring
* did not match, newer versions raise Not_found
*)
if xnum <> "" then begin
let n = hex_of_string xnum in
makechar n
end
else begin
let name =
try Netstring_str.matched_group r 4 s with Not_found -> "" in
(* Note: Older versions of Pcre return "" when the substring
* did not match, newer versions raise Not_found
*)
assert(name <> "");
lookup_entity name
end
end
in
Buffer.add_string buf replacement;
n := n1;
)
occurrences;
let n0 = String.length s in
if n0 > !n then
Buffer.add_string buf (recode_str (String.sub s !n (n0 - !n)));
(* TODO: avoid String.sub *)
(* Return *)
Buffer.contents buf
)
;;
let decode_to_latin1 =
decode ~in_enc:`Enc_iso88591 ~out_enc:`Enc_iso88591
~lookup:(fun s -> "&" ^ s ^ ";")
~subst:(fun p -> "&#" ^ string_of_int p ^ ";")
()
end