]>
Commit | Line | Data |
---|---|---|
23324ae1 FM |
1 | ///////////////////////////////////////////////////////////////////////////// |
2 | // Name: convauto.h | |
e54c96f1 | 3 | // Purpose: interface of wxConvAuto |
23324ae1 | 4 | // Author: wxWidgets team |
526954c5 | 5 | // Licence: wxWindows licence |
23324ae1 FM |
6 | ///////////////////////////////////////////////////////////////////////////// |
7 | ||
038809c2 VZ |
8 | /** |
9 | Constants representing various BOM types. | |
10 | ||
11 | BOM is an abbreviation for "Byte Order Mark", a special Unicode character | |
12 | which may be inserted into the beginning of a text stream to indicate its | |
13 | encoding. | |
14 | ||
15 | @since 2.9.3 | |
16 | */ | |
17 | enum wxBOM | |
18 | { | |
19 | /** | |
20 | Unknown BOM. | |
21 | ||
22 | This is returned if BOM presence couldn't be determined and normally | |
23 | happens because not enough bytes of input have been analysed. | |
24 | */ | |
25 | wxBOM_Unknown = -1, | |
26 | ||
27 | /** | |
28 | No BOM. | |
29 | ||
30 | The stream doesn't contain a BOM character at all. | |
31 | */ | |
32 | wxBOM_None, | |
33 | ||
34 | /** | |
35 | UTF-32 big endian BOM. | |
36 | ||
37 | The stream is encoded in big endian variant of UTF-32. | |
38 | */ | |
39 | wxBOM_UTF32BE, | |
40 | ||
41 | /** | |
42 | UTF-32 little endian BOM. | |
43 | ||
44 | The stream is encoded in little endian variant of UTF-32. | |
45 | */ | |
46 | wxBOM_UTF32LE, | |
47 | ||
48 | /** | |
49 | UTF-16 big endian BOM. | |
50 | ||
51 | The stream is encoded in big endian variant of UTF-16. | |
52 | */ | |
53 | wxBOM_UTF16BE, | |
54 | ||
55 | /** | |
56 | UTF-16 little endian BOM. | |
57 | ||
58 | The stream is encoded in little endian variant of UTF-16. | |
59 | */ | |
60 | wxBOM_UTF16LE, | |
61 | ||
62 | /** | |
63 | UTF-8 BOM. | |
64 | ||
65 | The stream is encoded in UTF-8. | |
66 | ||
67 | Notice that, contrary to popular belief, it's perfectly possible and, | |
68 | in fact, common under Microsoft Windows systems, to have a BOM in a | |
69 | UTF-8 stream: while it's not used to indicate the endianness of UTF-8 | |
70 | stream (as it's byte-oriented), the BOM can still be useful just as an | |
71 | unambiguous indicator of UTF-8 being used. | |
72 | */ | |
73 | wxBOM_UTF8 | |
74 | }; | |
75 | ||
23324ae1 FM |
76 | /** |
77 | @class wxConvAuto | |
7c913512 | 78 | |
23324ae1 FM |
79 | This class implements a Unicode to/from multibyte converter capable of |
80 | automatically recognizing the encoding of the multibyte text on input. The | |
81 | logic used is very simple: the class uses the BOM (byte order mark) if it's | |
bd0812fe BP |
82 | present and tries to interpret the input as UTF-8 otherwise. If this fails, |
83 | the input is interpreted as being in the default multibyte encoding which | |
84 | can be specified in the constructor of a wxConvAuto instance and, in turn, | |
85 | defaults to the value of GetFallbackEncoding() if not explicitly given. | |
7c913512 | 86 | |
23324ae1 FM |
87 | For the conversion from Unicode to multibyte, the same encoding as was |
88 | previously used for multibyte to Unicode conversion is reused. If there had | |
89 | been no previous multibyte to Unicode conversion, UTF-8 is used by default. | |
bd0812fe BP |
90 | Notice that once the multibyte encoding is automatically detected, it |
91 | doesn't change any more, i.e. it is entirely determined by the first use of | |
92 | wxConvAuto object in the multibyte-to-Unicode direction. However, creating a | |
93 | copy of wxConvAuto object, either via the usual copy constructor or | |
94 | assignment operator, or using wxMBConv::Clone(), resets the automatically | |
95 | detected encoding so that the new copy will try to detect the encoding of | |
96 | the input on first use. | |
7c913512 | 97 | |
bd0812fe BP |
98 | This class is used by default in wxWidgets classes and functions reading |
99 | text from files such as wxFile, wxFFile, wxTextFile, wxFileConfig and | |
100 | various stream classes so the encoding set with its SetFallbackEncoding() | |
101 | method will affect how these classes treat input files. In particular, use | |
102 | this method to change the fall-back multibyte encoding used to interpret | |
103 | the contents of the files whose contents aren't valid UTF-8 or to disallow | |
104 | it completely. | |
7c913512 | 105 | |
23324ae1 | 106 | @library{wxbase} |
bd0812fe | 107 | @category{data} |
7c913512 | 108 | |
bd0812fe | 109 | @see @ref overview_mbconv |
23324ae1 FM |
110 | */ |
111 | class wxConvAuto : public wxMBConv | |
112 | { | |
113 | public: | |
114 | /** | |
bd0812fe BP |
115 | Constructs a new wxConvAuto instance. The object will try to detect the |
116 | encoding of the multibyte text given to its wxMBConv::ToWChar() method | |
117 | automatically but if the automatic detection of Unicode encodings | |
118 | fails, the fall-back encoding @a enc will be used to interpret it as | |
119 | multibyte text. | |
120 | ||
121 | The default value of @a enc, @c wxFONTENCODING_DEFAULT, means that the | |
122 | global default value (which can be set using SetFallbackEncoding()) | |
123 | should be used. As with that method, passing @c wxFONTENCODING_MAX | |
124 | inhibits using this encoding completely so the input multibyte text | |
125 | will always be interpreted as UTF-8 in the absence of BOM and the | |
126 | conversion will fail if the input doesn't form a valid UTF-8 sequence. | |
127 | ||
128 | Another special value is @c wxFONTENCODING_SYSTEM which means to use | |
129 | the encoding currently used on the user system, i.e. the encoding | |
130 | returned by wxLocale::GetSystemEncoding(). Any other encoding will be | |
131 | used as is, e.g. passing @c wxFONTENCODING_ISO8859_1 ensures that | |
132 | non-UTF-8 input will be treated as latin1. | |
23324ae1 FM |
133 | */ |
134 | wxConvAuto(wxFontEncoding enc = wxFONTENCODING_DEFAULT); | |
135 | ||
038809c2 VZ |
136 | |
137 | /** | |
138 | Return the detected BOM type. | |
139 | ||
140 | The BOM type is detected after sufficiently many initial bytes have | |
141 | passed through this conversion object so it will always return | |
142 | wxBOM_Unknown immediately after the object creation but may return a | |
143 | different value later. | |
144 | ||
145 | @since 2.9.3 | |
146 | */ | |
147 | wxBOM GetBOM() const; | |
148 | ||
64b91e2d VZ |
149 | /** |
150 | Return a pointer to the characters that make up this BOM. | |
151 | ||
152 | The returned character count is 2, 3 or 4, or undefined if the return | |
153 | value is NULL. | |
154 | ||
155 | @param bom | |
156 | A valid BOM type, i.e. not wxBOM_Unknown or wxBOM_None. | |
157 | @param count | |
158 | A non-@NULL pointer receiving the number of characters in this BOM. | |
159 | @return | |
160 | Pointer to characters composing the BOM or @NULL if BOM is unknown | |
161 | or invalid. Notice that the returned string is not NUL-terminated | |
162 | and may contain embedded NULs so @a count must be used to handle it | |
163 | correctly. | |
164 | ||
165 | @since 2.9.3 | |
166 | */ | |
167 | const char* GetBOMChars(wxBOM bom, size_t* count); | |
168 | ||
23324ae1 | 169 | /** |
bd0812fe BP |
170 | Disable the use of the fall back encoding: if the input doesn't have a |
171 | BOM and is not valid UTF-8, the conversion will fail. | |
23324ae1 FM |
172 | */ |
173 | static void DisableFallbackEncoding(); | |
174 | ||
175 | /** | |
bd0812fe BP |
176 | Returns the encoding used by default by wxConvAuto if no other encoding |
177 | is explicitly specified in constructor. By default, returns | |
7c913512 | 178 | @c wxFONTENCODING_ISO8859_1 but can be changed using |
bd0812fe | 179 | SetFallbackEncoding(). |
23324ae1 FM |
180 | */ |
181 | static wxFontEncoding GetFallbackEncoding(); | |
182 | ||
183 | /** | |
bd0812fe BP |
184 | Changes the encoding used by default by wxConvAuto if no other encoding |
185 | is explicitly specified in constructor. The default value, which can be | |
186 | retrieved using GetFallbackEncoding(), is @c wxFONTENCODING_ISO8859_1. | |
187 | ||
188 | Special values of @c wxFONTENCODING_SYSTEM or @c wxFONTENCODING_MAX can | |
189 | be used for the @a enc parameter to use the encoding of the current | |
190 | user locale as fall back or not use any encoding for fall back at all, | |
191 | respectively (just as with the similar constructor parameter). However, | |
192 | @c wxFONTENCODING_DEFAULT can't be used here. | |
23324ae1 FM |
193 | */ |
194 | static void SetFallbackEncoding(wxFontEncoding enc); | |
e54c96f1 | 195 | |
038809c2 VZ |
196 | /** |
197 | Return the BOM type of this buffer. | |
198 | ||
199 | This is a helper function which is normally only used internally by | |
200 | wxConvAuto but provided for convenience of the code that wants to | |
201 | detect the encoding of a stream by checking it for BOM presence on its | |
202 | own. | |
203 | ||
204 | @since 2.9.3 | |
205 | */ | |
206 | static wxBOM DetectBOM(const char *src, size_t srcLen); | |
207 | }; |