(* $Id: netconversion.mli 1084 2007-02-20 12:36:17Z gerd $
* ----------------------------------------------------------------------
*)
(** Conversion between character encodings
*
* {b Contents}
* {ul
* {- {!Netconversion.preliminaries}
* {ul
* {- {!Netconversion.unicode}}
* {- {!Netconversion.subsets}}
* {- {!Netconversion.linking}}
* {- {!Netconversion.domain}}
* {- {!Netconversion.problems}}}}
* {- {!Netconversion.interface}
* {ul
* {- {!Netconversion.direct_conv}}
* {- {!Netconversion.cursors}
* {ul {- {!Netconversion.bom}}}}
* {- {!Netconversion.unicode_functions}}
* }
* }
* }
*)
(** {1:preliminaries Preliminaries}
*
* A {b character set} is a set of characters where every character is
* identified by a {b code point}. An {b encoding} is a way of
* representing characters from a set in byte strings. For example,
* the Unicode character set has more than 96000 characters, and
* the code points have values from 0 to 0x10ffff (not all code points
* are assigned yet). The UTF-8 encoding represents the code points
* by sequences of 1 to 4 bytes. There are also encodings that
 * represent code points from several sets, e.g. EUC-JP covers four
* sets.
*
* Encodings are enumerated by the type [encoding], and names follow
* the convention [`Enc_*], e.g. [`Enc_utf8].
* Character sets are enumerated by the type
* [charset], and names follow the convention [`Set_*], e.g.
* [`Set_unicode].
*
* This module deals mainly with encodings. It is important to know
* that the same character set may have several encodings. For example,
* the Unicode character set can be encoded as UTF-8 or UTF-16.
* For the 8 bit character sets, however, there is usually only one
 * encoding, e.g. [`Set_iso88591] is always encoded as [`Enc_iso88591].
*
* In a {b single-byte encoding} every code point is represented by
 * one byte. This is what many programmers are accustomed to, and
 * what the O'Caml language specially supports: A [string] is
 * a sequence of [char]s, where [char] means an 8 bit quantity
 * interpreted as a character. For example, the following piece of code allocates
* a [string] of four [char]s, and assigns them individually:
*
* {[
* let s = String.create 4 in
* s.[0] <- 'G';
* s.[1] <- 'e';
* s.[2] <- 'r';
* s.[3] <- 'd';
* ]}
*
* In a {b multi-byte encoding} there are code points that are represented
* by several bytes. As we still represent such text as [string], the
* problem arises that a single [char], actually a byte, often represents
* only a fraction of a full multi-byte character. There are two solutions:
* - Give up the principle that text is represented by [string].
* This is, for example, the approach chosen by [Camomile], another O'Caml
* library dealing with Unicode. Instead, text is represented as
* [int array]. This way, the algorithms processing the text can
* remain the same.
* - Give up the principle that individual characters can be directly
* accessed in a text. This is the primary way chosen by Ocamlnet.
 * This means that it is no longer possible to read
 * or write the [n]th character of a text directly. One can, however, still
* compose texts by just concatenating the strings representing
* individual characters. Furthermore, it is possible to define
* a cursor for a text that moves sequentially along the text.
* The consequence is that programmers are restricted to sequential
* algorithms. Note that the majority of text processing falls into
* this class.
*
* The corresponding piece of code for Ocamlnet's Unicode implementation
* is:
* {[
* let b = Buffer.create 80 in
 * Buffer.add_string b (ustring_of_uchar `Enc_utf8 71);  (* 71 = code point of 'G' *)
 * Buffer.add_string b (ustring_of_uchar `Enc_utf8 101); (* 101 = code point of 'e' *)
 * Buffer.add_string b (ustring_of_uchar `Enc_utf8 114); (* 114 = code point of 'r' *)
 * Buffer.add_string b (ustring_of_uchar `Enc_utf8 100); (* 100 = code point of 'd' *)
* let s = Buffer.contents b
* ]}
*
* It is important to always remember that a [char] is no longer
* a character but simply a byte. In many of the following explanations,
* we strictly distinguish between {b byte positions} or {b byte counts},
* and {b character positions} or {b character counts}.
*
 * There are a number of special effects that usually only occur in
 * multi-byte encodings (a short illustration follows the list):
*
* - Bad encodings: Not every byte sequence is legal. When scanning
* such text, the functions will raise the exception [Malformed_code]
* when they find illegal bytes.
* - Unassigned code points: It may happen that a byte sequence is
* a correct representation for a code point, but that the code point
* is unassigned in the character set. When scanning, this is also
* covered by the exception [Malformed_code]. When converting from
* one encoding to another, it is also possible that the code point
* is only unassigned in the target character set. This case is
* usually handled by a substitution function [subst], and if no such
* function is defined, by the exception [Cannot_represent].
* - Incomplete characters: The trailing bytes of a string may be the
* correct beginning of a byte sequence for a character, but not a
* complete sequence. Of course, if that string is the end of a
* text, this is just illegal, and also a case for [Malformed_code].
* However, when text is processed chunk by chunk, this phenomenon
* may happen legally for all chunks but the last. For this reason,
* some of the functions below handle this case specially.
* - Byte order marks: Some encodings have both big and little endian
* variants. A byte order mark at the beginning of the text declares
* which variant is actually used. This byte order mark is a
* declaration written like a character, but actually not a
* character.
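 *
 * For illustration, using the function [ustring_length] declared below
 * (the byte 195 starts a two-byte UTF-8 sequence, so a string ending with
 * it is incomplete):
 *
 * {[
 * ustring_length `Enc_utf8 "B\195\164r"   (* = 3 characters *)
 * ustring_length `Enc_utf8 "B\195"        (* raises Malformed_code *)
 * ]}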
*
* There is a special class of encodings known as {b ASCII-compatible}.
* They are important because there are lots of programs and protocols
* that only interpret bytes from 0 to 127, and treat the bytes from
* 128 to 255 as data. These programs can process texts as long as
* the bytes from 0 to 127 are used as in ASCII. Fortunately, many
* encodings are ASCII-compatible, including UTF-8.
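 *
 * This property can be tested with the predicate [is_ascii_compatible]
 * declared below, e.g.:
 *
 * {[
 * is_ascii_compatible `Enc_utf8    (* = true *)
 * is_ascii_compatible `Enc_utf16   (* = false *)
 * ]}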
*
* {2:unicode Unicode}
*
* [Netconversion] is centred around Unicode.
* The conversion from one encoding to another works by finding the
* Unicode code point of the character
* to convert, and by representing the code point in the target encoding,
 * even if neither encoding has anything to do with Unicode.
* Of course, this approach requires that all character sets handled
* by [Netconversion] are subsets of Unicode.
*
 * The supported range of Unicode code points is: 0 to 0xd7ff, 0xe000 to 0xfffd,
* 0x10000 to 0x10ffff. All these code points can be represented in
* UTF-8 and UTF-16. [Netconversion] does not know which of the code
* points are assigned and which not, and because of this, it simply
* allows all code points of the mentioned ranges (but for other character
* sets, the necessary lookup tables exist).
*
* {b UTF-8:} The UTF-8 representation can have one to four bytes. Malformed
 * byte sequences are always rejected, even overlong forms such as
 * "0xc0 0x80" for the code point 0. There is special support
* for the Java variant of UTF-8 ([`Enc_java]). UTF-8 strings must not
 * have a byte order mark (it would be interpreted as a "zero-width
 * non-breakable space" character).
*
* {b UTF-16:} When reading from a string encoded as [`Enc_utf16], a byte
* order mark is expected at the beginning. The detected variant
* ([`Enc_utf16_le] or [`Enc_utf16_be]) is usually returned by the parsing
 * function. The byte order mark is not included in the output string.
 * Note that some functions of this
 * module cannot cope with [`Enc_utf16] (i.e. UTF-16 without endianness
 * annotation), and will fail.
*
 * Once the endianness is determined, the code point 0xfeff is no longer
* interpreted as byte order mark, but as "zero-width non-breakable space".
*
* Some code points are represented by pairs of 16 bit values, these
* are the so-called "surrogate pairs". They can only occur in UTF-16.
*
* {2:subsets Subsets of Unicode}
*
 * The non-Unicode character sets are subsets of Unicode. Here, it may
 * happen that a Unicode code point does not have a corresponding
 * code point in the subset. In this case, certain rules are applied to handle
* this (see below). It is, however, ensured that every non-Unicode
* code point has a corresponding Unicode code point. (In other words,
* character sets cannot be supported for which this property does
* not hold.)
*
* It is even possible to create further subsets artificially. The
* encoding [`Enc_subset(e,def)] means to derive a new encoding from
* the existing one [e], but to only accept the code points for which
* the definition function [def] yields the value [true]. For example,
* the encoding
* {[ `Enc_subset(`Enc_usascii,
* fun i -> i <> 34 && i <> 38 && i <> 60 && i <> 62) ]}
 * is ASCII without the angle brackets, the quotation mark, and the
* ampersand character, i.e. the subset of ASCII that can be included
* in HTML text without escaping.
*
* If a code point is not defined by the encoding but found in a text,
* the reader will raise the exception [Malformed_code]. When text is
* output, however, the [subst] function will be called for undefined code
* points (which raises [Cannot_represent] by default). The [subst]
 * function is an optional argument of many conversion functions and
 * makes it possible to insert a substitution text for undefined code points.
 * Note, however, that the substitution text is restricted to at most
 * 50 bytes (because unlimited length would lead to difficult
* problems we would like to avoid).
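 *
 * For example, a conversion to the HTML-safe subset shown above might
 * substitute a question mark for every undefined code point. This is a
 * sketch using the [convert] function declared below; the replacement
 * text "?" is an arbitrary choice:
 *
 * {[
 * let html_safe = `Enc_subset(`Enc_usascii,
 *                             fun i -> i <> 34 && i <> 38 && i <> 60 && i <> 62) in
 * convert ~in_enc:`Enc_utf8 ~out_enc:html_safe ~subst:(fun _ -> "?") s
 * ]}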
*
* {2:linking Linking this module}
*
* Many encodings require lookup tables. The following encodings
* are built-in and always supported:
*
* - Unicode: [`Enc_utf8], [`Enc_java], [`Enc_utf16], [`Enc_utf16_le],
[`Enc_utf16_be]
* - Other: [`Enc_usascii], [`Enc_iso88591], [`Enc_empty]
*
* The lookup tables for the other encodings are usually loaded at
* runtime, but it is also possible to embed them in the generated
* binary executable. See the file [INSTALL] for details. The functions
* [available_input_encodings] and [available_output_encodings] can
* be invoked to find out which encodings can be loaded, or are available
* otherwise.
*
* {2:domain Supported Encodings, Restrictions}
*
* I took the mappings from [www.unicode.org], and the standard names of
* the character sets from IANA. Obviously, many character sets are missing
* that can be supported; especially ISO646 character sets, and many EBCDIC
* code pages. Stateful encodings like generic ISO-2022 have been omitted
* (stateless subsets of ISO-2022 like EUC can be supported, however;
* currently we support EUC-JP and EUC-KR).
*
* Because of the copyright statement from Unicode, I cannot put the
* source tables that describe the mappings into the distribution. They
* are publicly available from [www.unicode.org].
*
* {2:problems Known Problems}
*
* - The following charsets do not have a bijective mapping to Unicode:
* adobe_standard_encoding, adobe_symbol_encoding,
* adobe_zapf_dingbats_encoding, cp1002 (0xFEBE). The current implementation
* simply removes one of the conflicting code point pairs - this might
 * not be what you want.
* - Japanese encodings:
* JIS X 0208: The character 1/32 is mapped to 0xFF3C, and not
* to 0x005C.
*)
(** {1:interface Interface}
*
* {b Naming conventions:}
*
* As it is possible to refer to substrings by either giving a byte
* offset or by counting whole characters, these naming conventions
* are helpful:
*
* - Labels called [range_pos] and [range_len] refer to byte positions of
* characters, or substrings
* - Labels called [count] refer to positions given as the number of characters
* relative to an origin
*
* Furthermore:
*
* - A [uchar] is a single Unicode code point represented as int
* - A [ustring] is a string of encoded characters
 * - A [uarray] is an [int array] representing a string
*)
exception Malformed_code
(** Raised when an illegal byte sequence is found *)
exception Cannot_represent of int
(** Raised when a certain Unicode code point cannot be represented in
* the selected output encoding
*)
(** The polymorphic variant enumerating the supported encodings. We have:
* - [`Enc_utf8]: UTF-8
* - [`Enc_java]: The UTF-8 variant used by Java (the only difference is
* the representation of NUL)
 * - [`Enc_utf16]: UTF-16 with unspecified endianness (restricted)
* - [`Enc_utf16_le]: UTF-16 little endian
* - [`Enc_utf16_be]: UTF-16 big endian
* - [`Enc_usascii]: US-ASCII (7 bits)
* - [`Enc_iso8859]{i n}: ISO-8859-{i n}
* - [`Enc_koi8r]: KOI8-R
* - [`Enc_jis0201]: JIS-X-0201 (Roman and Katakana)
 * - [`Enc_eucjp]: EUC-JP (code points from US-ASCII, JIS-X-0201, -0208, and
* -0212)
* - [`Enc_euckr]: EUC-KR (code points from US-ASCII, KS-X-1001)
* - [`Enc_windows]{i n}: WINDOWS-{i n}
* - [`Enc_cp]{i n}: IBM code page {i n}. Note that there are both ASCII-
* and EBCDIC-based code pages
* - [`Enc_adobe_*]: Adobe-specific encodings, e.g. used in Adobe fonts
* - [`Enc_mac*]: Macintosh-specific encodings
* - [`Enc_subset(e,def)]: The subset of [e] by applying the definition
* function [def]
* - [`Enc_empty]: The empty encoding (does not represent any character)
*)
type encoding =
[ `Enc_utf8 (* UTF-8 *)
| `Enc_java (* The variant of UTF-8 used by Java *)
| `Enc_utf16 (* UTF-16 with unspecified endianness (restricted usage) *)
| `Enc_utf16_le (* UTF-16 little endian *)
| `Enc_utf16_be (* UTF-16 big endian *)
| `Enc_usascii (* US-ASCII (only 7 bit) *)
| `Enc_iso88591 (* ISO-8859-1 *)
| `Enc_iso88592 (* ISO-8859-2 *)
| `Enc_iso88593 (* ISO-8859-3 *)
| `Enc_iso88594 (* ISO-8859-4 *)
| `Enc_iso88595 (* ISO-8859-5 *)
| `Enc_iso88596 (* ISO-8859-6 *)
| `Enc_iso88597 (* ISO-8859-7 *)
| `Enc_iso88598 (* ISO-8859-8 *)
| `Enc_iso88599 (* ISO-8859-9 *)
| `Enc_iso885910 (* ISO-8859-10 *)
| `Enc_iso885911 (* ISO-8859-11 *)
| `Enc_iso885913 (* ISO-8859-13 *)
| `Enc_iso885914 (* ISO-8859-14 *)
| `Enc_iso885915 (* ISO-8859-15 *)
| `Enc_iso885916 (* ISO-8859-16 *)
| `Enc_koi8r (* KOI8-R *)
| `Enc_jis0201 (* JIS-X-0201 (Roman in lower half; Katakana in upper half) *)
| `Enc_eucjp (* EUC-JP (includes US-ASCII, JIS-X-0201, -0208, -0212) *)
(* Japanese, TODO: *)
(*| `Enc_iso2022jp of jis_state = [ `Enc_usascii | `Enc_jis0201 |
`Enc_jis0208_1978 | `Enc_jis0208_1893 ]
It is very likely that ISO-2022 will be handled in a different module.
This encoding is too weird.
| `Enc_sjis
*)
| `Enc_euckr (* EUC-KR (includes US-ASCII, KS-X-1001) *)
(* Microsoft: *)
| `Enc_windows1250 (* WINDOWS-1250 *)
| `Enc_windows1251 (* WINDOWS-1251 *)
| `Enc_windows1252 (* WINDOWS-1252 *)
| `Enc_windows1253 (* WINDOWS-1253 *)
| `Enc_windows1254 (* WINDOWS-1254 *)
| `Enc_windows1255 (* WINDOWS-1255 *)
| `Enc_windows1256 (* WINDOWS-1256 *)
| `Enc_windows1257 (* WINDOWS-1257 *)
| `Enc_windows1258 (* WINDOWS-1258 *)
(* IBM, ASCII-based: *)
| `Enc_cp437
| `Enc_cp737
| `Enc_cp775
| `Enc_cp850
| `Enc_cp852
| `Enc_cp855
| `Enc_cp856
| `Enc_cp857
| `Enc_cp860
| `Enc_cp861
| `Enc_cp862
| `Enc_cp863
| `Enc_cp864
| `Enc_cp865
| `Enc_cp866
| `Enc_cp869
| `Enc_cp874
| `Enc_cp1006
(* IBM, EBCDIC-based: *)
| `Enc_cp037
| `Enc_cp424
| `Enc_cp500
| `Enc_cp875
| `Enc_cp1026
| `Enc_cp1047
(* Adobe: *)
| `Enc_adobe_standard_encoding
| `Enc_adobe_symbol_encoding
| `Enc_adobe_zapf_dingbats_encoding
(* Apple: *)
| `Enc_macroman
(* Encoding subset: *)
| `Enc_subset of (encoding * (int -> bool))
| `Enc_empty (* does not encode any character *)
]
(** A [charset] is simply a set of code points. It does not say how
* the code points are encoded as bytes. Every encoding implies a certain
* charset (or several charsets) that can be encoded, but the reverse is
* not true.
*)
type charset =
[ `Set_unicode (* The full Unicode repertoire *)
| `Set_usascii (* US-ASCII (only 7 bit) *)
| `Set_iso88591 (* ISO-8859-1 *)
| `Set_iso88592 (* ISO-8859-2 *)
| `Set_iso88593 (* ISO-8859-3 *)
| `Set_iso88594 (* ISO-8859-4 *)
| `Set_iso88595 (* ISO-8859-5 *)
| `Set_iso88596 (* ISO-8859-6 *)
| `Set_iso88597 (* ISO-8859-7 *)
| `Set_iso88598 (* ISO-8859-8 *)
| `Set_iso88599 (* ISO-8859-9 *)
| `Set_iso885910 (* ISO-8859-10 *)
| `Set_iso885911 (* ISO-8859-11 *)
| `Set_iso885913 (* ISO-8859-13 *)
| `Set_iso885914 (* ISO-8859-14 *)
| `Set_iso885915 (* ISO-8859-15 *)
| `Set_iso885916 (* ISO-8859-16 *)
| `Set_koi8r (* KOI8-R *)
| `Set_jis0201 (* JIS-X-0201 *)
| `Set_jis0208 (* JIS-X-0208 *)
| `Set_jis0212 (* JIS-X-0212 *)
| `Set_ks1001 (* KS-X-1001 *)
(* Microsoft: *)
| `Set_windows1250 (* WINDOWS-1250 *)
| `Set_windows1251 (* WINDOWS-1251 *)
| `Set_windows1252 (* WINDOWS-1252 *)
| `Set_windows1253 (* WINDOWS-1253 *)
| `Set_windows1254 (* WINDOWS-1254 *)
| `Set_windows1255 (* WINDOWS-1255 *)
| `Set_windows1256 (* WINDOWS-1256 *)
| `Set_windows1257 (* WINDOWS-1257 *)
| `Set_windows1258 (* WINDOWS-1258 *)
(* IBM, ASCII-based: *)
| `Set_cp437
| `Set_cp737
| `Set_cp775
| `Set_cp850
| `Set_cp852
| `Set_cp855
| `Set_cp856
| `Set_cp857
| `Set_cp860
| `Set_cp861
| `Set_cp862
| `Set_cp863
| `Set_cp864
| `Set_cp865
| `Set_cp866
| `Set_cp869
| `Set_cp874
| `Set_cp1006
(* IBM, EBCDIC-based: *)
| `Set_cp037
| `Set_cp424
| `Set_cp500
| `Set_cp875
| `Set_cp1026
| `Set_cp1047
(* Adobe: *)
| `Set_adobe_standard_encoding
| `Set_adobe_symbol_encoding
| `Set_adobe_zapf_dingbats_encoding
(* Apple: *)
| `Set_macroman
]
(** {b Pre-evaluation of the encoding argument:}
*
 * A number of the following functions can be made to run faster if they are
* called several times for the same encoding. In this case, it is recommended
* to apply the function once partially with the encoding argument, and to
* call the resulting closure instead. For example, [ustring_of_uchar] supports
* this technique:
*
* {[
* let my_ustring_of_uchar = ustring_of_uchar my_enc in
* let s1 = my_ustring_of_uchar u1 ...
* let s2 = my_ustring_of_uchar u2 ... ]}
*
* This is {b much} faster than
*
* {[
* let s1 = ustring_of_uchar my_enc u1 ...
* let s2 = ustring_of_uchar my_enc u2 ... ]}
*
* The availability of this optimization is indicated by the predicate
* PRE_EVAL({i arg}) where {i arg} identifies the encoding argument.
*
* {b Inlining}
*
* When a function can be inlined across module/library boundaries,
* this is indicated by the predicate INLINED. Of course, this works
* only for the ocamlopt compiler.
*)
val encoding_of_string : string -> encoding;;
(** Returns the encoding belonging to the given encoding name. Fails if the
 * name is unknown.
* E.g. [encoding_of_string "iso-8859-1" = `Enc_iso88591]
*
* Punctuation characters (e.g. "-") and year suffixes (e.g.
* ":1991") are ignored.
*)
val string_of_encoding : encoding -> string;;
(** Returns the name of the encoding. *)
val is_ascii_compatible : encoding -> bool;;
(** "ASCII compatible" means: The bytes 1 to 127 represent the ASCII
* codes 1 to 127, and no other representation of a character contains
* the bytes 1 to 127.
*
 * For example, ISO-8859-1 is ASCII-compatible because the bytes 1 to
 * 127 mean the same as in ASCII, and all other characters use bytes
 * greater than 127. UTF-8 is ASCII-compatible for the same reasons;
 * it does not matter that there are multi-byte characters.
* EBCDIC is not ASCII-compatible because the bytes 1 to 127 do not mean
* the same as in ASCII. UTF-16 is not ASCII-compatible because the bytes
* 1 to 127 can occur in multi-byte representations of non-ASCII
* characters.
*
* The byte 0 has been excluded from this definition because the C
* language uses it with a special meaning that has nothing to do with
* characters, so it is questionable to interpret the byte 0 anyway.
*)
val is_single_byte : encoding -> bool
(** Returns whether the encoding is a single-byte encoding *)
val same_encoding : encoding -> encoding -> bool
(** Whether both encodings are the same. [`Enc_subset] encodings are only
 * considered equal when the definition functions are physically the same.
*
* Warning: Don't use ( = ) to compare encodings because this may
* fail.
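 *
 * A sketch of why [( = )] is unsafe: [`Enc_subset] carries a closure, and
 * the polymorphic comparison of closures fails at runtime:
 *
 * {[
 * let def i = i < 128 in
 * let e1 = `Enc_subset(`Enc_usascii, def)
 * and e2 = `Enc_subset(`Enc_usascii, def) in
 * assert (same_encoding e1 e2);   (* the definition functions are physically equal *)
 * e1 = e2                         (* may raise Invalid_argument *)
 * ]}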
*)
val byte_order_mark : encoding -> string
(** Returns the byte order mark that must occur at the beginning of
* files to indicate whether "little endian" or "big endian" is used.
* If this does not apply to the encoding, an empty string is returned.
*
* See also the section about "{!Netconversion.bom}" below.
*)
val makechar : encoding -> int -> string
(** [makechar enc i]:
* Creates the string representing the Unicode code point [i] in encoding
* [enc]. Raises [Not_found] if the character is legal but cannot be
* represented in [enc].
*
* Possible encodings: everything but [`Enc_utf16].
*
* Evaluation hints:
* - PRE_EVAL(encoding)
*
* @deprecated This function is deprecated since ocamlnet-0.96. Use
* [ustring_of_uchar] instead.
*)
val ustring_of_uchar : encoding -> int -> string
(** [ustring_of_uchar enc i]:
* Creates the string representing the Unicode code point [i] in encoding
* [enc]. Raises [Cannot_represent i] if the character is legal but cannot be
* represented in [enc].
*
* Possible encodings: everything but [`Enc_utf16].
*
* Evaluation hints:
* - PRE_EVAL(encoding)
*)
val to_unicode : charset -> int -> int
(** Maps the code point of the charset to the corresponding
 * Unicode code point, or raises [Malformed_code] when the
 * input number does not correspond to a code point.
 *
 * Note [`Set_jis0208] and [`Set_jis0212]: Code points are usually
 * given by a row and column number. The numeric code point expected by
 * this function is computed by multiplying the row number (1..94) by 96,
* and by adding the column number (1..94), i.e. row*96+column.
*
* Evaluation hints:
* - PRE_EVAL(charset)
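 *
 * For example, the row/column convention can be wrapped in a small helper
 * (a sketch; [jis_point] is a hypothetical name, and the lookup only
 * succeeds if the JIS mapping tables are available):
 *
 * {[
 * let jis_point ~row ~col = row * 96 + col in
 * to_unicode `Set_jis0208 (jis_point ~row:16 ~col:1)   (* row 16, column 1 = 1537 *)
 * ]}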
*)
val from_unicode : charset -> int -> int
(** Maps the Unicode code point to the corresponding code point of
* the charset, or raises [Cannot_represent] when there is no such
* corresponding code point.
*
* Note [`Set_jis0208] and [`Set_jis0212]: Code points are usually
* given by a row and column number. The numeric code point returned by
 * this function is computed by multiplying the row number (1..94) by 96,
* and by adding the column number (1..94), i.e. row*96+column.
*
* Evaluation hints:
* - PRE_EVAL(charset)
*)
val available_input_encodings : unit -> encoding list
(** Returns the list of all available encodings that can be used for
* input strings. The list reflects the set of loadable/linked [Netmapping]
* modules.
*)
val available_output_encodings : unit -> encoding list
(** Returns the list of all available encodings that can be used for
* output strings. The list reflects the set of loadable/linked [Netmapping]
* modules.
*)
(**********************************************************************)
(* Conversion between character encodings *)
(**********************************************************************)
(** {2:direct_conv Direct Conversion} *)
(** In order to convert a string from one encoding to another, call
* [convert] like in
*
* {[ let s_utf8 =
* convert ~in_enc:`Enc_iso88591 ~out_enc:`Enc_utf8 s_latin1 ]}
*
* which converts the ISO-8859-1 string [s_latin1] to the UTF-8 string
* [s_utf8].
*
* It is also possible to convert while reading from or writing to a file.
* This use case is effectively handled by the class
* {!Netconversion.conversion_pipe}.
* See the explanations of this class for examples.
*)
val convert : ?subst:(int -> string) ->
in_enc:encoding ->
out_enc:encoding ->
?range_pos:int -> ?range_len:int ->
string ->
string
(** Converts the string from [in_enc] to [out_enc], and returns it.
* The string must consist of a whole number of characters. If it
* ends with an incomplete multi-byte character, however, this is
* detected, and the exception [Malformed_code] will be raised.
* This exception is also raised for other encoding errors in the
* input string.
*
* @param subst This function is invoked for code points of [in_enc] that
* cannot be represented in [out_enc], and the result of the function
* invocation is substituted (directly, without any further conversion).
* Restriction: The string returned by [subst] must not be longer than 50
* bytes.
* If [subst] is missing, [Cannot_represent] is raised in this case.
*
* @param range_pos Selects a substring for conversion. [range_pos]
* is the byte position of the first character of the substring.
* (Default: 0)
*
* @param range_len Selects a substring for conversion. [range_len]
* is the length of the substring in bytes (Default: Length
* of the input string minus [range_pos])
*)
val recode_string : in_enc:encoding ->
out_enc:encoding ->
?subst:(int -> string) ->
string ->
string
(** Recodes a complete string from [in_enc] to [out_enc], and returns it.
* The function [subst] is invoked for code points of [in_enc] that cannot
* be represented in [out_enc], and the result of the function invocation
* is substituted.
* Restriction: The string returned by [subst] must not be longer than 50
* bytes.
* If [subst] is missing, [Not_found] is raised in this case.
*
* @deprecated This function is obsolete since ocamlnet-0.96. Use
* [convert] instead.
*)
val recode : in_enc:encoding ->
in_buf:string ->
in_pos:int ->
in_len:int ->
out_enc:encoding ->
out_buf:string ->
out_pos:int ->
out_len:int ->
max_chars:int ->
subst:(int -> string) -> (int * int * encoding)
(**
* Converts the character sequence contained in the at most [in_len] bytes
* of [in_buf] starting at byte position [in_pos], and writes the result
* into at most [out_len] bytes of [out_buf] starting at byte position
* [out_pos]. At most [max_chars] characters are converted from
* [in_buf] to [out_buf].
*
* The characters in [in_buf] are assumed to be encoded as [in_enc], and the
* characters in [out_buf] will be encoded as [out_enc]. The case
* [in_enc = out_enc] is not handled specially, and is carried out as
* fast as any other conversion.
*
* If there is a code point which cannot be represented in [out_enc],
* the function [subst] is called with the code point as argument, and the
* resulting string (which must already be encoded as [out_enc]) is
* inserted instead.
* It is possible that [subst] is called several times for the same
 * character. Restriction: The string returned by [subst] must not be longer
* than 50 bytes.
*
* It is allowed that the input buffer ends with an incomplete
* multi-byte character. This character is not converted, i.e. the
* conversion ends just before this character. This special condition
* is not indicated to the caller.
*
* @return The triple [(in_n, out_n, in_enc')] is returned:
* - [in_n] is the actual number of bytes that have been converted from
* [in_buf]; [in_n] may be smaller than [in_len] because of incomplete
* multi-byte characters, or because the output buffer has less space
* for characters than the input buffer, or because of a change
* of the encoding variant.
* - [out_n] is the actual number of bytes written into [out_buf].
* - [in_enc'] is normally identical to [in_enc]. However, there are cases
* where the encoding can be refined when looking at the byte
* sequence; for example whether a little endian or big endian variant
* of the encoding is used. [in_enc'] is the variant of [in_enc] that was
* used for the last converted character.
*
* If there is at least one complete character in [in_buf], and at least
* space for one complete character in [out_buf], and [max_chars >= 1], it is
* guaranteed that [in_n > 0 && out_n > 0].
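 *
 * The following sketch shows the intended calling pattern: a string is
 * converted chunk by chunk into a buffer. The helper name and the output
 * chunk size of 256 bytes are arbitrary choices:
 *
 * {[
 * let convert_by_chunks in_enc out_enc s =
 *   let out_buf = String.create 256 in
 *   let result = Buffer.create (String.length s) in
 *   let in_pos = ref 0 in
 *   let enc = ref in_enc in
 *   while !in_pos < String.length s do
 *     let (in_n, out_n, enc') =
 *       recode
 *         ~in_enc:!enc ~in_buf:s ~in_pos:!in_pos
 *         ~in_len:(String.length s - !in_pos)
 *         ~out_enc ~out_buf ~out_pos:0 ~out_len:256
 *         ~max_chars:max_int
 *         ~subst:(fun p -> raise (Cannot_represent p)) in
 *     if in_n = 0 then
 *       raise Malformed_code;   (* no complete character left, e.g. an
 *                                  incomplete character at the very end *)
 *     Buffer.add_substring result out_buf 0 out_n;
 *     in_pos := !in_pos + in_n;
 *     enc := enc'
 *   done;
 *   Buffer.contents result
 * ]}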
*)
class conversion_pipe :
?subst:(int -> string) ->
in_enc:encoding ->
out_enc:encoding ->
unit ->
Netchannels.io_obj_channel
(** This pipeline class (see [Netchannels] for more information) can be used
* to recode a netchannel while reading or writing. The argument [in_enc]
* is the input encoding, and [out_enc] is the output encoding.
*
* The channel must consist of a whole number of characters. If it
* ends with an incomplete multi-byte character, however, this is
* detected, and the exception [Malformed_code] will be raised.
* This exception is also raised for other encoding errors in the
* channel data.
*
* {b Example.} Convert ISO-8859-1 to UTF-8 while writing to the file
* ["output.txt"]:
*
* {[
* let ch = new output_channel (open_out "output.txt") in
* let encoder =
* new conversion_pipe ~in_enc:`Enc_iso88591 ~out_enc:`Enc_utf8 () in
* let ch' = new output_filter encoder ch in
* ... (* write to ch' *)
* ch' # close_out();
* ch # close_out(); (* you must close both channels! *)
* ]}
*
* If you write as UTF-16, don't forget to output the byte order
* mark yourself, as the channel does not do this.
*
* {b Example.} Convert UTF-16 to UTF-8 while reading from the file
* ["input.txt"]:
*
* {[
* let ch = new input_channel (open_in "input.txt") in
* let encoder =
* new conversion_pipe ~in_enc:`Enc_utf16 ~out_enc:`Enc_utf8 () in
* let ch' = new input_filter ch encoder in
* ... (* read from ch' *)
* ch' # close_in();
* ch # close_in(); (* you must close both channels! *)
* ]}
*
* @param subst This function is invoked for code points of [in_enc] that
* cannot be represented in [out_enc], and the result of the function
* invocation is substituted (directly, without any further conversion).
* Restriction: The string returned by [subst] must not be longer than 50
* bytes.
* If [subst] is missing, [Cannot_represent] is raised in this case.
*)
class recoding_pipe :
?subst:(int -> string) ->
in_enc:encoding ->
out_enc:encoding ->
unit ->
Netchannels.io_obj_channel
(** Recodes a channel like [conversion_pipe]. The difference is that
* [subst] raises [Not_found] by default, and not [Cannot_represent].
*
* @deprecated This class is deprecated since ocamlnet-0.96. Use
* [conversion_pipe] instead.
*)
(**********************************************************************)
(* Cursors *)
(**********************************************************************)
(** {2:cursors Reading Text Using Cursors}
*
* A cursor is a reference to a character in an encoded string. The
* properties of the current character can be obtained, and the cursor
* can be moved relative to its current position.
*
* For example, the following loop outputs the Unicode code points
* of all characters of the UTF-8 input string [s]:
*
* {[
* let cs = create_cursor `Enc_utf8 s in
* while not (cursor_at_end cs) do
* let n = cursor_char_count cs in
* let ch = uchar_at cs in
* printf "At position %d: %d\n" n ch;
* move cs;
* done
* ]}
*
* For a more exact definition, cursors are modeled as follows: The reference
* to the encoded string is contained in the cursor. This
* can be a complete string, or an arbitrary substring (denoted by a
* range of valid byte positions). The cursor
* position can be initially set to an arbitrary byte position of the
* encoded string.
*
* Cursor positions can be denoted by
* - byte positions [p] in the encoded string, or by
* - character counts [n] relative to the initial position.
*
* Valid cursor positions are:
* - [n=0]: This is always the initial cursor position
 * - [n>0]: Positive char counts refer to characters to the right of the initial
 * character. The rightmost position is the position [n_max] past the
 * rightmost character. The rightmost position does not have a
 * code point.
 * - [n<0]: Negative char counts refer to characters to the left of the initial
* character. The leftmost position is the position [n_min] of the
* leftmost character.
*
* For the empty string we have [n_min = n_max = 0], complementing the
* above definition.
*
* Cursors are moved to the left or right of their current position
 * by a whole number of characters. When an attempt is made to move them
 * past the leftmost or rightmost position, the cursor is placed at the
* leftmost or rightmost position, respectively, and the exception
* [Cursor_out_of_range] is raised.
*
* There are two cases of illegal encodings:
* - When the last byte sequence of the encoded string is an incomplete
* multi-byte character, this is detected, and the special exception
* [Partial_character] is raised when the code point of this character
* is read. Note that this can only happen at position [n_max-1]. It
* is allowed to move beyond this character to [n_max].
* - When an illegal byte sequence occurs in the encoded string (including
* an incomplete multi-byte character at the beginning of the string),
* it is not possible to move the cursor to this character, or across
 * this character. When this is attempted, the cursor stops just
* before the bad sequence, and the exception [Malformed_code] is
* raised.
*
* It is undefined what happens when the encoded string is modified
* while a cursor is in use referring to it.
*)
type cursor
(** A cursor denotes a character position in an encoded string *)
exception End_of_string
(** Raised when an attempt is made to access the character after the end of the
* string (at position [n_max])
*)
exception Cursor_out_of_range
(** Raised when an attempt is made to move the cursor beyond the beginning of the
* string or beyond the end of the string. In the latter case, it is
* legal to move the cursor to the position following the last character,
* but it is not possible to move it further.
*)
exception Partial_character
(** Raised when the last character of the string is an incomplete
 * multi-byte character, and an attempt is made to get its code point
* (using [uchar_at]).
*)
exception Byte_order_mark
(** Raised when an attempt is made to read the code point of the BOM at the
* beginning of the string
*)
val create_cursor : ?range_pos:int -> ?range_len:int ->
?initial_rel_pos:int ->
encoding -> string -> cursor
(** Creates a new cursor for the passed string and the passed encoding.
* By default, the allowed range of the cursor is the whole string,
 * and the cursor is initially positioned at the beginning of the string.
 * The {b range} is the part of the string the cursor can move within.
 *
 * {b Special behaviour for [`Enc_utf16]:} UTF-16 with unspecified
 * endianness is handled specially. First, this encoding is only
* accepted when [initial_rel_pos=0]. Second, the first two bytes
* must be a byte order mark (BOM) (if the string has a length of two
* bytes or more). The BOM counts as character without code point.
* The function [uchar_at] raises the exception [Byte_order_mark]
* when the BOM is accessed. Third, when the cursor is moved to the
* next character, the encoding as returned by [cursor_encoding] is
* changed to either [`Enc_utf16_le] or [`Enc_utf16_be] according
* to the BOM. The encoding changes back to [`Enc_utf16] when the
* cursor is moved back to the initial position.
*
* @param range_pos Restricts the range of the cursor to a substring.
* The argument [range_pos] is the byte position of the beginning
* of the range. (Defaults to 0)
* @param range_len Restricts the range of the cursor to a substring.
* The argument [range_len] is the length of the range.
* (Default: Length of the input string minus [range_pos])
* @param initial_rel_pos The initial position of the cursor, given
* as bytes relative to [range_pos]. The character at this position
* is considered as the zeroth character of the string (as reported
* by [cursor_char_count])
*)
val reinit_cursor : ?range_pos:int -> ?range_len:int ->
?initial_rel_pos:int ->
?enc:encoding -> string -> cursor -> unit
(** Reuses an existing cursor for a new purpose. The arguments are
* as in [create_cursor].
*)
val copy_cursor : ?enc:encoding -> cursor -> cursor
(** Copies the cursor. The copy can be moved independently of the original
* cursor, but is applied to the same string. The copy starts at the
* byte position of the string where the original cursor is currently
* positioned.
*
* @param enc Optionally, the assumed
* encoding can be changed to a different one by passing [enc].
*)
val cursor_target : cursor -> string
(** Returns the string of the cursor
*
* Evaluation hints:
* - INLINED
*)
val cursor_range : cursor -> (int * int)
(** Returns the valid range of the cursor as pair [(range_pos, range_len)]
*
* Evaluation hints:
* - INLINED
*)
val cursor_initial_rel_pos : cursor -> int
(** Returns the initial relative byte position of the cursor
*
* Evaluation hints:
* - INLINED
*)
val cursor_char_count : cursor -> int
(** Returns the character count of the cursor. The initial position
* (when [create_cursor] was called) has the number 0, positions to the
* right denote positive numbers, and positions to the left negative numbers.
*
* Evaluation hints:
* - INLINED
*)
val cursor_pos : cursor -> int
(** Returns the byte position of the cursor, i.e. the byte index of
* the string that corresponds to the cursor position. The function
* returns the absolute position (i.e. NOT relative to [cursor_range]).
*
* Evaluation hints:
* - INLINED
*)
val uchar_at : cursor -> int
(** Returns the Unicode code point of the character at the cursor.
* Raises [End_of_string] if the cursor is positioned past the last
* character.
* Raises [Partial_character] if the last character of the analysed
* string range is an incomplete multi-byte character.
* Raises [Byte_order_mark] if the first character of the string
* is a BOM (when the encoding has BOMs).
*
* Evaluation hints:
* - INLINED
*)
val cursor_byte_length : cursor -> int
(** Returns the byte length of the representation of the character at the
 * cursor. This also works for incomplete multi-byte characters and
* BOMs.
* Raises [End_of_string] if the cursor is positioned past the last
* character.
*
* Evaluation hints:
* - INLINED
*)
val cursor_at_end : cursor -> bool
(** Returns whether the cursor is positioned past the last character.
*
* Evaluation hints:
* - INLINED
*)
val move : ?num:int -> cursor -> unit
(** Moves the cursor one character to the right, or if [num] is passed,
* this number of characters to the right. [num] can be negative in
* which case the cursor is moved to the left.
*
 * If the requested move would place the cursor outside the valid range,
 * the cursor would enter an illegal state. This case is therefore
* handled as follows: the cursor moves to the
* leftmost or rightmost position (depending on the direction),
* and the exception [Cursor_out_of_range] is raised.
*)
val cursor_encoding : cursor -> encoding
(** Returns the encoding of the cursor. For some encodings, the
* returned encoding depends on the position of the cursor (see
 * the note about UTF-16 in [create_cursor])
*
* Evaluation hints:
* - INLINED
*)
val cursor_blit : cursor -> int array -> int -> int -> int
(** [cursor_blit cs ua pos len]: Copies at most [len] characters as code
* points from
* the cursor position and the following positions to the array [ua]
* at index [pos]. The number of copied characters is returned.
* If the cursor is already at the end of the string when this
* function is called, the exception [End_of_string] will be raised instead,
* and no characters are copied. The cursor positions containing byte
* order marks and partial characters are never copied; this is ensured
* by stopping the copying procedure just before these positions. This
* may even make the function return the number 0.
*
* The function tries to copy as many characters as currently available
* in the already decoded part of the string the cursor is attached to.
* In the current implementation, this number is not higher than 250.
* You can call [cursor_blit_maxlen] to get an upper limit.
*
* The function does not move the cursor.
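 *
 * A sketch of the intended usage (the helper name is an arbitrary choice):
 * drain a cursor into one [int array] by calling [cursor_blit] repeatedly
 * and moving the cursor by the number of copied characters. BOMs and
 * partial characters make the function return 0; they are skipped by
 * moving one position:
 *
 * {[
 * let drain cs =
 *   let buf = Array.make 250 0 in
 *   let acc = ref [] in
 *   ( try
 *       while true do
 *         let n = cursor_blit cs buf 0 (Array.length buf) in
 *         if n > 0 then begin
 *           acc := Array.sub buf 0 n :: !acc;
 *           move ~num:n cs
 *         end
 *         else
 *           move cs               (* skip a BOM or a partial character *)
 *       done
 *     with End_of_string -> () );
 *   Array.concat (List.rev !acc)
 * ]}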
*)
val cursor_blit_maxlen : cursor -> int
(** Returns the maximum number of characters [cursor_blit] can copy
* at the current cursor position. This is the number of characters
* [cursor_blit] would copy if the [len] argument were arbitrarily
* large.
*
* Note that the value depends on the cursor position and on the
* contents of the cursor string.
*
* This function raises [End_of_string] if the cursor is positioned
* at the end of the string.
*)
val cursor_blit_positions : cursor -> int array -> int -> int -> int
(** Works like [cursor_blit], but copies the byte positions of the
* characters into [ua] instead of the code points.
*
* When called directly after [cursor_blit] for the same cursor and
* with the same value of [len], this function copies as many characters
* and thus returns the same number:
*
* {[let n1 = cursor_blit cs ua ua_pos len in
 * let n2 = cursor_blit_positions cs pa pa_pos len in
* assert (n1 = n2)]}
*)
(** {3:bom Byte Order Marks}
*
* Because UTF-16 allows both little and big endian, files and other
* permanent representations of UTF-16 text are usually prepended by
* a byte order mark (BOM). There is confusion about the BOM among
* Unicode users, so the following explanations may be helpful.
*
* Of course, the BOM is only used for external representations like
 * files, as the endianness is always known for in-memory representations
* by the running program. This module has three encoding identifiers:
 * - [`Enc_utf16]: UTF-16 where the endianness is unknown
* - [`Enc_utf16_le]: UTF-16 little endian
* - [`Enc_utf16_be]: UTF-16 big endian
*
 * When a file is read, the endianness is unknown at the beginning.
* This is expressed by [`Enc_utf16]. When the BOM is read, the encoding
* is refined to either [`Enc_utf16_le] or [`Enc_utf16_be], whatever
* the BOM says. This works as follows: The BOM is the representation
* of the code point 0xfeff as little or big endian, i.e. as byte sequences
* "0xfe 0xff" (big endian) or "0xff 0xfe" (little endian). As the "wrong"
* code point 0xfffe is intentionally unused, the reader can determine
 * the endianness.
*
* There is one problem, though. Unfortunately, the code point 0xfeff
* is also used for the "zero width non-breakable space" character.
* When this code point occurs later in the text, it is interpreted as
* this character. Of course, this means that one must know whether
* there is a BOM at the beginning, and if not, one must know the
 * endianness. One cannot program in the style "well, let's see what is
* coming and guess".
*
* Furthermore, the BOM is only used for encodings where one can specify
 * the endianness. It must not be used for UTF-8, for example, as the
* byte order is fixed for this encoding. When a UTF-8 text begins with
* the code point 0xfeff, it is always the "zero width non-breakable space"
* character.
*
* The functions of this module can all deal with BOMs when reading
* encoded text. In most cases, the BOM is hidden from the caller,
* and just handled automatically. Cursors, however, treat BOMs as special
* characters outside of the code set (exception [Byte_order_mark] is
* raised). The writing functions of this module do not generate BOMs,
 * as there is no way to tell them that a BOM is needed. The
* function [byte_order_mark] can be used to output the BOM manually.
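 *
 * For example, to create the contents of a UTF-16 LE file with a leading
 * BOM, one might write (a sketch):
 *
 * {[
 * let utf16le_file_contents s_utf8 =
 *   byte_order_mark `Enc_utf16_le
 *   ^ convert ~in_enc:`Enc_utf8 ~out_enc:`Enc_utf16_le s_utf8
 * ]}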
*
* {3 Examples for Cursors}
*
* Create the cursor:
*
* [ let cs = create_cursor `Enc_utf8 "B\195\164r";; ]
*
* The cursor is now positioned at the 'B':
*
* [ uchar_at cs ] {i returns} [66] (i.e. B)
*
* Move the cursor one character to the right. In UTF-8, this is a
* two-byte character consisting of the bytes 195 and 164:
*
* [ move cs ;; ]
*
* [ uchar_at cs ] {i returns} [228] (i.e. a-Umlaut)
*
* One can easily move the cursor to the end of the string:
*
* [ move ~num:max_int cs ;; ]
*
* This raises [Cursor_out_of_range], but places the cursor at the end.
* This is the position past the last letter 'r':
*
* [ uchar_at cs ] {i raises} [End_of_string]
*
* Go one character to the left:
*
* [ move ~num:(-1) cs ;; ]
*
* [ uchar_at cs ] {i returns} [114] (i.e. r)
*
* Cursors can only move relative to their current position. Of course,
* one can easily write a function that moves to an absolute position,
* like
*
 * {[ let move_abs n cs =
 *      let delta = n - cursor_char_count cs in
 *      move ~num:delta cs ]}
*
* However, this operation is expensive (O(string length)), and should
* be avoided for efficient algorithms. Cursors are not arrays, and an
* algorithm should only be based on cursors when it is possible to
* iterate over the characters of the string one after another.
*)
(**********************************************************************)
(* String functions *)
(**********************************************************************)
(** {2:unicode_functions Unicode String Functions} *)
val ustring_length :
encoding -> ?range_pos:int -> ?range_len:int -> string -> int
(** Returns the length of the string in characters. The function fails
 * with [Malformed_code] when illegal byte sequences or incomplete
 * characters are found in the string.
*
* Evaluation hints:
* - PRE_EVAL(encoding)
*
* @param range_pos The byte position of the substring to measure
* (default: 0)
* @param range_len The byte length of the substring to measure
* (default: byte length of the input string minus [range_pos])
*)
val ustring_iter :
encoding ->
(int -> unit) ->
?range_pos:int -> ?range_len:int ->
string ->
unit
(** Iterates over the characters of a string, and calls the passed function
* for every code point. The function raises [Malformed_code] when
* illegal byte sequences or incomplete characters are found.
*
* @param encoding specifies the encoding
* @param range_pos The byte position of the substring to iterate over
* (default: 0)
* @param range_len The byte length of the substring to iterate over
* (default: byte length of the input string minus [range_pos])
*)
val ustring_map :
encoding ->
(int -> int list) ->
?range_pos:int -> ?range_len:int ->
string ->
string
(** Maps every character of a string to a list of characters, and returns
* the concatenated string.
* The [encoding] argument determines the encoding of both the argument
* and the result string.
* The map function gets every character as its Unicode code point, and
* must return the list of code points to map to.
*
* The function raises [Malformed_code] when
* illegal byte sequences or incomplete characters are found.
*
* @param range_pos The byte position of the substring to map
* (default: 0)
* @param range_len The byte length of the substring to map
* (default: byte length of the input string minus [range_pos])
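 *
 * For example, a mapping that deletes all CR characters (code point 13)
 * from a UTF-8 string might look like this (a sketch):
 *
 * {[
 * let strip_cr s =
 *   ustring_map `Enc_utf8 (fun p -> if p = 13 then [] else [p]) s
 * ]}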
*)
val ustring_sub :
encoding ->
int ->
int ->
?range_pos:int -> ?range_len:int ->
string ->
string
(** [ustring_sub enc start length s]: Returns the substring of [s] starting
 * at character count [start] and consisting of [length] characters. Note
 * that [start] and [length] are counted in (possibly multi-byte)
 * characters, not in bytes.
*
* If the optional byte-based [range_pos] and [range_len] arguments are
* present, these arguments are taken to determine a first substring
* before [start] and [length] are applied to extract the final
* substring.
*
* The function raises [Malformed_code] when
* illegal byte sequences or incomplete characters are found.
*
* @param range_pos The byte position of the substring to extract
* (default: 0)
* @param range_len The byte length of the substring to extract
* (default: byte length of the input string minus [range_pos])
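 *
 * For example (the string is "B", a-umlaut, "r" in UTF-8):
 *
 * {[
 * ustring_sub `Enc_utf8 1 2 "B\195\164r"   (* = "\195\164r" *)
 * ]}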
*)
val ustring_compare :
encoding ->
(int -> int -> int) ->
?range_pos:int -> ?range_len:int ->
string ->
?range_pos:int -> ?range_len:int ->
string ->
int
(** Compares two strings lexicographically. The first argument is the
* encoding of both strings (which must be the same). The second argument
* is the function that compares two Unicode code points. It must return
* 0 if both characters are the same, a negative value if the first
* character is the smaller one, and a positive value if the second
* character is the smaller one.
*
* The function raises [Malformed_code] when
* illegal byte sequences or incomplete characters are found.
*
* @param range_pos The byte position of the substring to compare
* (default: 0), referring to the following string argument
* @param range_len The byte length of the substring to compare
* (default: byte length of the input string minus [range_pos]),
* referring to the following string argument
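 *
 * For example, a comparison that folds ASCII letters to lower case could
 * be written as follows (a sketch; only the code points 65-90 are folded):
 *
 * {[
 * let fold p = if p >= 65 && p <= 90 then p + 32 else p
 * let cmp_ascii_ci s1 s2 =
 *   ustring_compare `Enc_utf8 (fun a b -> compare (fold a) (fold b)) s1 s2
 * ]}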
*)
val uarray_of_ustring :
encoding ->
?range_pos:int -> ?range_len:int ->
string ->
int array
(** Returns the characters of the string as array of Unicode code points.
*
* @param range_pos The byte position of the substring to extract
* (default: 0)
* @param range_len The byte length of the substring to extract
* (default: byte length of the input string minus [range_pos])
*)
val ustring_of_uarray :
?subst:(int -> string) ->
encoding ->
?pos:int -> ?len:int ->
int array ->
string
(** Returns the array of Unicode code points as an encoded string.
*
* @param pos Selects a subarray: [pos] is the first array position
* to encode (default: 0)
* @param len Selects a subarray: [len] is the length of the subarray
* to encode (default: array length minus [pos])
* @param subst This function is called when a code point cannot be represented
 * in the chosen character encoding. It must return the (already encoded)
 * string to substitute for this code point. By default (if [subst] is
* not passed), the exception [Cannot_represent] will be raised in this
* case.
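 *
 * For example, a crude transliteration to US-ASCII could be written as
 * follows (a sketch; every unrepresentable character becomes a question
 * mark):
 *
 * {[
 * let to_ascii s =
 *   let ua = uarray_of_ustring `Enc_utf8 s in
 *   ustring_of_uarray ~subst:(fun _ -> "?") `Enc_usascii ua
 * ]}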
*)
exception Malformed_code_at of int
(** An illegal byte sequence is found at this byte position *)
val verify : encoding -> ?range_pos:int -> ?range_len:int -> string -> unit
(** Checks whether the string is properly encoded. If so, () is returned.
* If not, the exception [Malformed_code_at] will be raised indicating
* the byte position where the problem occurs.
*
* @param range_pos The byte position of the substring to verify
* (default: 0)
* @param range_len The byte length of the substring to verify
* (default: byte length of the input string minus [range_pos])
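 *
 * For example, one might validate user input like this (a sketch):
 *
 * {[
 * let check_utf8 s =
 *   try verify `Enc_utf8 s; None
 *   with Malformed_code_at p -> Some p   (* byte position of the error *)
 * ]}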
*)
(**********************************************************************)
(* Internal *)
(**/**)
val big_slice : int
(* The length of the normal cursor slices. A "small slice" always has
 * length 1.
*)
val read_iso88591_ref :
(int -> encoding -> int array -> int array -> string ->
int -> int -> (int*int*encoding)) ref
val read_utf8_ref :
(bool -> int array -> int array -> string -> int -> int -> (int*int*encoding))
ref
(* The two read_* variables are initialised with default implementations.
 * They are overridden by Netaccel (if linked).
*)