Plasma GitLab Archive
Projects Blog Knowledge

(*
 * <COPYRIGHT>
 * Copyright 2002 Joachim Schrod Network and Publication Consultance GmbH, Gerd Stolpmann
 *
 * <GPL>
 * This file is part of WDialog.
 *
 * WDialog is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * WDialog is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with WDialog; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 * </>
 *)

(* $Id: wd_encoding.ml,v 3.6 2004-03-30 22:16:37 stolpmann Exp $
 * ----------------------------------------------------------------------
 *
 *)

let subst rex f s =
  let matches = try Pcre.exec_all ~rex s with Not_found -> [||] in
  let substitutes =
    Array.map (fun m ->
		 let (pos,pos') = Pcre.get_substring_ofs m 0 in
		 f pos (Pcre.get_substring m 0)) matches in
  let newlen = ref (String.length s) in
  Array.iteri
    (fun i m ->
       let sb = substitutes.(i) in
       let p0,p1 = Pcre.get_substring_ofs m 0 in
       newlen := !newlen - (p1-p0) + String.length sb
    )
    matches;
  let s' = String.create !newlen in
  let k = ref 0 in    (* position in s *)
  let k' = ref 0 in   (* position in s' *)
  Array.iteri
    (fun i m ->
       let sb = substitutes.(i) in
       let p0,p1 = Pcre.get_substring_ofs m 0 in
       String.blit s !k s' !k' (p0 - !k);
       k' := !k' + (p0 - !k);
       String.blit sb 0 s' !k' (String.length sb);
       k' := !k' + (String.length sb);
       k := p1;
    )
    matches;
  String.blit s !k s' !k' (String.length s - !k);
  assert(!k' + String.length s - !k = String.length s');
  s'
;;


let encode_as_html_re = (Pcre.regexp "[\\<\\>\\&\\\"]");;

let encode_as_html =
  subst
    encode_as_html_re
    (fun pos ->
       function
	   "<"  -> "&lt;"
	 | ">"  -> "&gt;"
	 | "&"  -> "&amp;"
	 | "\"" -> "&quot;"
	 | s    -> "&#" ^ string_of_int (Char.code s.[0]) ^ ";" 
	     (* This case is actually not used! *)
    )
;;

let repeat n s =
  (* Repeat the string [s] [n] times *)
  let l = String.length s in
  let s' = String.create (l * n) in
  for k = 0 to n-1 do
    String.blit s 0 s' (k*l) l
  done;
  s'
;;

let encode_as_pre_re = Pcre.regexp "[ \r\n\t]";;

let encode_as_pre =
  let last_nl = ref 0 in
  subst
    encode_as_pre_re
    (fun pos ->
       function
	   " "  -> "&nbsp;"
	 | "\r" -> ""   (* simply drop *)
	 | "\n" -> last_nl := pos+1; "<br>"
	 | "\t" ->
	     let spaces = 8 - (pos - !last_nl) mod 8 in
	     last_nl := pos+1;
	     repeat spaces "&nbsp;"
	 | _ ->
	     assert false
    )
;;

let encode_as_para_re = Pcre.regexp "\r?\n(\r?\n)+";;

let encode_as_para s =
  "<p>" ^
  subst
    encode_as_para_re
    (fun pos _ -> "<p>")
    s
;;

let encode_as_js_string_re =
  (Pcre.regexp "\\\\|\\'|\\\"|\\<|\\%|\\x00|[\001-\031]|\127");;

let encode_as_js_string =
  subst
    encode_as_js_string_re
    (fun pos s ->
       match s with
           "\\" -> "\\\\"
         | "\"" -> "\\\""
	 | "'" -> "\\'"
	 | _ ->
	     let c = s.[0] in
	       Printf.sprintf "\\x%02x" (Char.code c)
    )
;;

type pair =
    Term of string
  | Pair of pair * pair
;;

let encode_as_js_longstring ~(enc : Pxp_types.rep_encoding) s =

  let utf8_hack = (enc = `Enc_utf8) in
  (* For UTF8: avoid that multi-byte characters are broken up *)

  let maximum = 80 in   (* maximum line length in bytes *)

  let rec find_char_boundary k =
    if utf8_hack && k < String.length s &&
       Char.code s.[k] >= 128 && Char.code s.[k] < 192
    then
      find_char_boundary (k+1)
	(* The byte at position k is not the first byte of a multi-byte
	 * character. Try the next byte.
	 *)
    else
      k
  in

  let rec divide startpos endpos =
    if endpos - startpos <= maximum then begin
      let t = encode_as_js_string (String.sub s startpos (endpos-startpos)) in
      if (String.length t) > maximum then begin
	let m = find_char_boundary ((startpos + endpos) asr 1) in
	Pair(divide startpos m, divide m endpos)
      end
      else
	Term t
    end
    else
      let m = find_char_boundary ((startpos + endpos) asr 1) in
      Pair(divide startpos m, divide m endpos)
  in

  let rec total_length =
    function
	Term t -> String.length t
      | Pair(p1,p2) -> total_length p1 + total_length p2 + 4
  in

  let rec join s' pos =
    function
	Term t ->
	  String.blit t 0 s' !pos (String.length t);
	  pos := !pos + (String.length t)
      | Pair(p1,p2) ->
	  join s' pos p1;
	  s'.[ !pos ] <- '"';
	  s'.[ !pos+1 ] <- '+';
	  s'.[ !pos+2 ] <- '\n';
	  s'.[ !pos+3 ] <- '"';
	  pos := !pos + 4;
	  join s' pos p2;
  in

  let expr = divide 0 (String.length s) in
  let l = total_length expr in
  let s' = String.create l in
  let pos = ref 0 in
  join s' pos expr;
  assert (!pos = l);
  s'
;;



(* ======================================================================
 * History:
 *
 * $Log: wd_encoding.ml,v $
 * Revision 3.6  2004-03-30 22:16:37  stolpmann
 * 	Bugfix in encode_as_pre (TAB expansion was wrong)
 *
 * Revision 3.5  2003/03/21 12:50:31  stolpmann
 * 	Fix: encode_as_js_longstring can cope with UTF8-encoded strings
 *
 * Revision 3.4  2003/03/08 17:46:48  stolpmann
 * 	Fixed bug with zero byte in a Pcre pattern
 *
 * Revision 3.3  2003/02/21 21:21:43  stolpmann
 * 	js encoding: Do not escape characters >= 128 because we
 * do not know the character set.
 *
 * Revision 3.2  2002/02/14 16:15:21  stolpmann
 * 	Added copyright notice.
 *
 * Revision 3.1  2002/02/12 20:29:18  stolpmann
 * 	Initial release at sourceforge.
 *
 * Revision 2.2  2002/01/30 15:12:49  gerd
 * 	HTML encoding: no longer generates entities for characters
 * >= 128. The advantage is that the function now works for both
 * `Enc_iso88591 and `Enc_utf8.
 *
 * Revision 2.1  2002/01/28 02:12:36  gerd
 * 	Initial revision.
 *
 *
 *)

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml