Commit | Line | Data |
---|---|---|
23324ae1 FM |
1 | ///////////////////////////////////////////////////////////////////////////// |
2 | // Name: convauto.h | |
e54c96f1 | 3 | // Purpose: interface of wxConvAuto |
23324ae1 FM |
4 | // Author: wxWidgets team |
5 | // RCS-ID: $Id$ | |
526954c5 | 6 | // Licence: wxWindows licence |
23324ae1 FM |
7 | ///////////////////////////////////////////////////////////////////////////// |
8 | ||
038809c2 VZ |
9 | /** |
10 | Constants representing various BOM types. | |
11 | ||
12 | BOM is an abbreviation for "Byte Order Mark", a special Unicode character | |
13 | which may be inserted into the beginning of a text stream to indicate its | |
14 | encoding. | |
15 | ||
16 | @since 2.9.3 | |
17 | */ | |
18 | enum wxBOM | |
19 | { | |
20 | /** | |
21 | Unknown BOM. | |
22 | ||
23 | This is returned if BOM presence couldn't be determined and normally | |
24 | happens because not enough bytes of input have been analysed. | |
25 | */ | |
26 | wxBOM_Unknown = -1, | |
27 | ||
28 | /** | |
29 | No BOM. | |
30 | ||
31 | The stream doesn't contain a BOM character at all. | |
32 | */ | |
33 | wxBOM_None, | |
34 | ||
35 | /** | |
36 | UTF-32 big endian BOM. | |
37 | ||
38 | The stream is encoded in the big endian variant of UTF-32. | |
39 | */ | |
40 | wxBOM_UTF32BE, | |
41 | ||
42 | /** | |
43 | UTF-32 little endian BOM. | |
44 | ||
45 | The stream is encoded in the little endian variant of UTF-32. | |
46 | */ | |
47 | wxBOM_UTF32LE, | |
48 | ||
49 | /** | |
50 | UTF-16 big endian BOM. | |
51 | ||
52 | The stream is encoded in the big endian variant of UTF-16. | |
53 | */ | |
54 | wxBOM_UTF16BE, | |
55 | ||
56 | /** | |
57 | UTF-16 little endian BOM. | |
58 | ||
59 | The stream is encoded in the little endian variant of UTF-16. | |
60 | */ | |
61 | wxBOM_UTF16LE, | |
62 | ||
63 | /** | |
64 | UTF-8 BOM. | |
65 | ||
66 | The stream is encoded in UTF-8. | |
67 | ||
68 | Notice that contrary to popular belief, it's perfectly possible and, | |
69 | in fact, common under Microsoft Windows systems, to have a BOM in a | |
70 | UTF-8 stream: while it's not used to indicate the endianness of UTF-8 | |
71 | stream (as it's byte-oriented), the BOM can still be useful just as an | |
72 | unambiguous indicator of UTF-8 being used. | |
73 | */ | |
74 | wxBOM_UTF8 | |
75 | }; | |
76 | ||
23324ae1 FM |
77 | /** |
78 | @class wxConvAuto | |
7c913512 | 79 | |
23324ae1 FM |
80 | This class implements a Unicode to/from multibyte converter capable of |
81 | automatically recognizing the encoding of the multibyte text on input. The | |
82 | logic used is very simple: the class uses the BOM (byte order mark) if it's | |
bd0812fe BP |
83 | present and tries to interpret the input as UTF-8 otherwise. If this fails, |
84 | the input is interpreted as being in the default multibyte encoding which | |
85 | can be specified in the constructor of a wxConvAuto instance and, in turn, | |
86 | defaults to the value of GetFallbackEncoding() if not explicitly given. | |
7c913512 | 87 | |
23324ae1 FM |
88 | For the conversion from Unicode to multibyte, the same encoding as was |
89 | previously used for multibyte to Unicode conversion is reused. If there had | |
90 | been no previous multibyte to Unicode conversion, UTF-8 is used by default. | |
bd0812fe BP |
91 | Notice that once the multibyte encoding is automatically detected, it |
92 | doesn't change any more, i.e. it is entirely determined by the first use of | |
93 | wxConvAuto object in the multibyte-to-Unicode direction. However creating a | |
94 | copy of wxConvAuto object, either via the usual copy constructor or | |
95 | assignment operator, or using wxMBConv::Clone(), resets the automatically | |
96 | detected encoding so that the new copy will try to detect the encoding of | |
97 | the input on first use. | |
7c913512 | 98 | |
bd0812fe BP |
99 | This class is used by default in wxWidgets classes and functions reading |
100 | text from files such as wxFile, wxFFile, wxTextFile, wxFileConfig and | |
101 | various stream classes so the encoding set with its SetFallbackEncoding() | |
102 | method will affect how these classes treat input files. In particular, use | |
103 | this method to change the fall-back multibyte encoding used to interpret | |
104 | the contents of the files whose contents aren't valid UTF-8 or to disallow | |
105 | it completely. | |
7c913512 | 106 | |
23324ae1 | 107 | @library{wxbase} |
bd0812fe | 108 | @category{data} |
7c913512 | 109 | |
bd0812fe | 110 | @see @ref overview_mbconv |
23324ae1 FM |
111 | */ |
112 | class wxConvAuto : public wxMBConv | |
113 | { | |
114 | public: | |
115 | /** | |
bd0812fe BP |
116 | Constructs a new wxConvAuto instance. The object will try to detect the |
117 | encoding of the multibyte text given to its wxMBConv::ToWChar() method | |
118 | automatically but if the automatic detection of Unicode encodings | |
119 | fails, the fall-back encoding @a enc will be used to interpret it as | |
120 | multibyte text. | |
121 | ||
122 | The default value of @a enc, @c wxFONTENCODING_DEFAULT, means that the | |
123 | global default value (which can be set using SetFallbackEncoding()) | |
124 | should be used. As with that method, passing @c wxFONTENCODING_MAX | |
125 | inhibits using this encoding completely so the input multibyte text | |
126 | will always be interpreted as UTF-8 in the absence of BOM and the | |
127 | conversion will fail if the input doesn't form a valid UTF-8 sequence. | |
128 | ||
129 | Another special value is @c wxFONTENCODING_SYSTEM which means to use | |
130 | the encoding currently used on the user system, i.e. the encoding | |
131 | returned by wxLocale::GetSystemEncoding(). Any other encoding will be | |
132 | used as is, e.g. passing @c wxFONTENCODING_ISO8859_1 ensures that | |
133 | non-UTF-8 input will be treated as latin1. | |
23324ae1 FM |
134 | */ |
135 | wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT); | |
136 | ||
038809c2 VZ |
137 | |
138 | /** | |
139 | Return the detected BOM type. | |
140 | ||
141 | The BOM type is detected after sufficiently many initial bytes have | |
142 | passed through this conversion object so it will always return | |
143 | wxBOM_Unknown immediately after the object creation but may return a | |
144 | different value later. | |
145 | ||
146 | @since 2.9.3 | |
147 | */ | |
148 | wxBOM GetBOM() const; | |
149 | ||
64b91e2d VZ |
150 | /** |
151 | Return a pointer to the characters that make up this BOM. | |
152 | ||
153 | The returned character count is 2, 3 or 4, or undefined if the return | |
154 | value is NULL. | |
155 | ||
156 | @param bom | |
157 | A valid BOM type, i.e. not wxBOM_Unknown or wxBOM_None. | |
158 | @param count | |
159 | A non-@NULL pointer receiving the number of characters in this BOM. | |
160 | @return | |
161 | Pointer to characters composing the BOM or @NULL if BOM is unknown | |
162 | or invalid. Notice that the returned string is not NUL-terminated | |
163 | and may contain embedded NULs so @a count must be used to handle it | |
164 | correctly. | |
165 | ||
166 | @since 2.9.3 | |
167 | */ | |
168 | const char* GetBOMChars(wxBOM bom, size_t* count); | |
169 | ||
23324ae1 | 170 | /** |
bd0812fe BP |
171 | Disable the use of the fall-back encoding: if the input doesn't have a |
172 | BOM and is not valid UTF-8, the conversion will fail. | |
23324ae1 FM |
173 | */ |
174 | static void DisableFallbackEncoding(); | |
175 | ||
176 | /** | |
bd0812fe BP |
177 | Returns the encoding used by default by wxConvAuto if no other encoding |
178 | is explicitly specified in the constructor. By default, returns | |
7c913512 | 179 | @c wxFONTENCODING_ISO8859_1 but can be changed using |
bd0812fe | 180 | SetFallbackEncoding(). |
23324ae1 FM |
181 | */ |
182 | static wxFontEncoding GetFallbackEncoding(); | |
183 | ||
184 | /** | |
bd0812fe BP |
185 | Changes the encoding used by default by wxConvAuto if no other encoding |
186 | is explicitly specified in the constructor. The default value, which can be | |
187 | retrieved using GetFallbackEncoding(), is @c wxFONTENCODING_ISO8859_1. | |
188 | ||
189 | Special values of @c wxFONTENCODING_SYSTEM or @c wxFONTENCODING_MAX can | |
190 | be used for the @a enc parameter to use the encoding of the current | |
191 | user locale as fall back or not use any encoding for fall back at all, | |
192 | respectively (just as with the similar constructor parameter). However, | |
193 | @c wxFONTENCODING_DEFAULT can't be used here. | |
23324ae1 FM |
194 | */ |
195 | static void SetFallbackEncoding(wxFontEncoding enc); | |
e54c96f1 | 196 | |
038809c2 VZ |
197 | /** |
198 | Return the BOM type of this buffer. | |
199 | ||
200 | This is a helper function which is normally only used internally by | |
201 | wxConvAuto but provided for convenience of the code that wants to | |
202 | detect the encoding of a stream by checking it for BOM presence on its | |
203 | own. | |
204 | ||
205 | @since 2.9.3 | |
206 | */ | |
207 | static wxBOM DetectBOM(const char *src, size_t srcLen); | |
208 | }; |