]>
Commit | Line | Data |
---|---|---|
23324ae1 FM |
1 | ///////////////////////////////////////////////////////////////////////////// |
2 | // Name: strconv.h | |
e54c96f1 | 3 | // Purpose: interface of wxMBConv and its derived conversion classes |
23324ae1 FM |
4 | // Author: wxWidgets team |
5 | // RCS-ID: $Id$ | |
6 | // Licence: wxWindows license | |
7 | ///////////////////////////////////////////////////////////////////////////// | |
8 | ||
9 | /** | |
f501c3a9 | 10 | @class wxMBConv |
7c913512 | 11 | |
f501c3a9 VZ |
12 | This class is the base class of a hierarchy of classes capable of |
13 | converting text strings between multibyte (SBCS or DBCS) encodings and | |
14 | Unicode. | |
15 | ||
16 | This is an abstract base class which defines the operations implemented by | |
17 | all different conversion classes. The derived classes don't add any new | |
18 | operations of their own (except, possibly, some non-default constructors) | |
19 | and so you should simply use this class's ToWChar() and FromWChar() (or | |
20 | cMB2WC() and cWC2MB()) methods with the objects of the derived class. | |
21 | ||
22 | In the documentation for this and related classes please notice that | |
23 | length of the string refers to the number of characters in the string | |
24 | not counting the terminating @c NUL, if any. While the size of the string | |
25 | is the total number of bytes in the string, including any trailing @c NUL. | |
26 | Thus, length of wide character string @c L"foo" is 3 while its size can | |
27 | be either 8 or 16 depending on whether @c wchar_t is 2 bytes (as | |
28 | under Windows) or 4 (Unix). | |
7c913512 | 29 | |
23324ae1 | 30 | @library{wxbase} |
79b40dcf | 31 | @category{conv} |
7c913512 | 32 | |
4701dc09 | 33 | @see wxCSConv, wxEncodingConverter, @ref overview_mbconv |
23324ae1 | 34 | */ |
f501c3a9 | 35 | class wxMBConv |
23324ae1 FM |
36 | { |
37 | public: | |
38 | /** | |
f501c3a9 | 39 | Trivial default constructor. |
23324ae1 | 40 | */ |
f501c3a9 | 41 | wxMBConv(); |
23324ae1 FM |
42 | |
43 | /** | |
f501c3a9 VZ |
44 | This pure virtual function is overridden in each of the derived classes |
45 | to return a new copy of the object it is called on. | |
7c913512 | 46 | |
f501c3a9 VZ |
47 | It is used for copying the conversion objects while preserving their |
48 | dynamic type. | |
49 | */ | |
50 | virtual wxMBConv* Clone() const = 0; | |
7c913512 | 51 | |
23324ae1 | 52 | /** |
f501c3a9 VZ |
53 | This function returns 1 for most of the multibyte encodings in which the |
54 | string is terminated by a single @c NUL, 2 for UTF-16 and 4 for UTF-32 for | |
55 | which the string is terminated with 2 and 4 @c NUL characters respectively. | |
56 | The other cases are not currently supported and @c wxCONV_FAILED | |
57 | (defined as -1) is returned for them. | |
23324ae1 | 58 | */ |
adaaa686 | 59 | virtual size_t GetMBNulLen() const; |
23324ae1 FM |
60 | |
61 | /** | |
f501c3a9 VZ |
62 | Returns the maximal value which can be returned by GetMBNulLen() for |
63 | any conversion object. | |
23324ae1 | 64 | |
f501c3a9 | 65 | Currently this value is 4. |
23324ae1 | 66 | |
f501c3a9 VZ |
67 | This method can be used to allocate the buffer with enough space for the |
68 | trailing @c NUL characters for any encoding. | |
69 | */ | |
70 | const size_t GetMaxMBNulLen(); | |
e54c96f1 | 71 | |
f501c3a9 VZ |
72 | /** |
73 | Convert multibyte string to a wide character one. | |
7c913512 | 74 | |
f501c3a9 VZ |
75 | This is the most general function for converting a multibyte string to |
76 | a wide string; cMB2WC() may often be more convenient, however this | |
77 | function is the most efficient one as it avoids any | |
78 | unnecessary copying. | |
7c913512 | 79 | |
f501c3a9 VZ |
80 | The main case is when @a dst is not @NULL and @a srcLen is not |
81 | @c wxNO_LEN (which is defined as @c (size_t)-1): then the function | |
82 | converts exactly @a srcLen bytes starting at @a src into wide string | |
83 | which it outputs to @a dst. If the length of the resulting wide | |
84 | string is greater than @a dstLen, an error is returned. Note that if | |
85 | @a srcLen bytes don't include @c NUL characters, the resulting wide | |
86 | string is not @c NUL-terminated either. | |
7c913512 | 87 | |
f501c3a9 VZ |
88 | If @a srcLen is @c wxNO_LEN, the function supposes that the string is |
89 | properly (i.e. as necessary for the encoding handled by this | |
90 | conversion) @c NUL-terminated and converts the entire string, including | |
91 | any trailing @c NUL bytes. In this case the wide string is also @c | |
92 | NUL-terminated. | |
93 | ||
94 | Finally, if @a dst is @NULL, the function returns the length of the | |
95 | needed buffer. | |
96 | ||
97 | Example of use of this function: | |
98 | @code | |
99 | size_t dstLen = conv.ToWChar(NULL, 0, src); | |
100 | if ( dstLen == wxCONV_FAILED ) | |
101 | ... handle error ... | |
102 | wchar_t *dst = new wchar_t[dstLen]; | |
103 | if ( conv.ToWChar(dst, dstLen, src) == wxCONV_FAILED ) | |
104 | ... handle error ... | |
105 | @endcode | |
106 | ||
107 | Notice that when passing the explicit source length the output will | |
108 | @e not be @c NUL terminated if you pass @c strlen(str) as parameter. | |
109 | Either leave @a srcLen as default @c wxNO_LEN or add one to @c strlen | |
110 | result if you want the output to be @c NUL terminated. | |
111 | ||
112 | @param dst | |
113 | Pointer to output buffer of the size of at least @a dstLen or @NULL. | |
114 | @param dstLen | |
115 | Maximal number of characters to be written to the output buffer if | |
4050e98d | 116 | @a dst is non-@NULL, unused otherwise. |
f501c3a9 VZ |
117 | @param src |
118 | Pointer to the source string, must not be @NULL. | |
4701dc09 FM |
119 | @param srcLen |
120 | The number of characters of the source string to convert or | |
121 | @c wxNO_LEN (default parameter) to convert everything up to and | |
f501c3a9 | 122 | including the terminating @c NUL character(s). |
4701dc09 | 123 | |
f501c3a9 VZ |
124 | @return |
125 | The number of characters written (or which would have been written | |
126 | if it were non-@NULL) to @a dst or @c wxCONV_FAILED on error. | |
23324ae1 | 127 | */ |
f501c3a9 VZ |
128 | virtual size_t ToWChar(wchar_t* dst, size_t dstLen, |
129 | const char* src, | |
130 | size_t srcLen = wxNO_LEN) const; | |
23324ae1 FM |
131 | |
132 | /** | |
f501c3a9 VZ |
133 | Converts wide character string to multibyte. |
134 | ||
135 | This function has the same semantics as ToWChar() except that it | |
136 | converts a wide string to multibyte one. As with ToWChar(), it may be | |
137 | more convenient to use cWC2MB() when working with @c NUL terminated | |
138 | strings. | |
139 | ||
140 | @param dst | |
141 | Pointer to output buffer of the size of at least @a dstLen or @NULL. | |
142 | @param dstLen | |
143 | Maximal number of characters to be written to the output buffer if | |
4050e98d | 144 | @a dst is non-@NULL, unused otherwise. |
f501c3a9 VZ |
145 | @param src |
146 | Pointer to the source string, must not be @NULL. | |
4701dc09 FM |
147 | @param srcLen |
148 | The number of characters of the source string to convert or | |
149 | @c wxNO_LEN (default parameter) to convert everything up to and | |
f501c3a9 | 150 | including the terminating @c NUL character. |
4701dc09 | 151 | |
f501c3a9 VZ |
152 | @return |
153 | The number of characters written (or which would have been written | |
154 | if it were non-@NULL) to @a dst or @c wxCONV_FAILED on error. | |
23324ae1 | 155 | */ |
f501c3a9 VZ |
156 | virtual size_t FromWChar(char* dst, size_t dstLen, |
157 | const wchar_t* src, | |
158 | size_t srcLen = wxNO_LEN) const; | |
23324ae1 | 159 | |
f501c3a9 | 160 | /** |
f6a02087 | 161 | Converts from multibyte encoding to Unicode by calling ToWChar() and |
f501c3a9 VZ |
162 | allocating a temporary wxWCharBuffer to hold the result. |
163 | ||
f6a02087 VZ |
164 | This function is a convenient wrapper around ToWChar() as it takes care |
165 | of allocating the buffer of the necessary size itself. Its parameters | |
166 | have the same meaning as for ToWChar(), in particular @a inLen can be | |
167 | specified explicitly in which case exactly that many characters are | |
168 | converted and @a outLen receives (if non-@NULL) exactly the | |
169 | corresponding number of wide characters, whether the last one of them | |
170 | is @c NUL or not. However if @a inLen is @c wxNO_LEN, then @a outLen | |
171 | doesn't count the trailing @c NUL even though it is always present in | |
172 | this case. | |
173 | ||
174 | Finally notice that if the conversion fails, the returned buffer is | |
175 | invalid and @a outLen is set to 0 (and not @c wxCONV_FAILED for | |
176 | compatibility concerns). | |
f501c3a9 | 177 | */ |
f6a02087 VZ |
178 | const wxWCharBuffer cMB2WC(const char* in, |
179 | size_t inLen = wxNO_LEN, | |
180 | size_t *outLen = NULL) const; | |
7c913512 | 181 | |
f501c3a9 VZ |
182 | //@{ |
183 | /** | |
184 | Converts from multibyte encoding to the current wxChar type (which | |
185 | depends on whether wxUSE_UNICODE is set to 1). | |
7c913512 | 186 | |
f501c3a9 VZ |
187 | If wxChar is char, it returns the parameter unaltered. If wxChar is |
188 | wchar_t, it returns the result in a wxWCharBuffer. The macro wxMB2WXbuf | |
189 | is defined as the correct return type (without const). | |
190 | */ | |
191 | const char* cMB2WX(const char* psz) const; | |
192 | const wxWCharBuffer cMB2WX(const char* psz) const; | |
193 | //@} | |
7c913512 | 194 | |
23324ae1 | 195 | /** |
f6a02087 | 196 | Converts from Unicode to multibyte encoding by calling FromWChar() and |
f501c3a9 VZ |
197 | allocating a temporary wxCharBuffer to hold the result. |
198 | ||
f6a02087 VZ |
199 | This function is a convenient wrapper around FromWChar() as it takes |
200 | care of allocating the buffer of necessary size itself. | |
201 | ||
202 | Its parameters have the same meaning as the corresponding parameters of | |
203 | FromWChar(), please see the description of cMB2WC() for more details. | |
23324ae1 | 204 | */ |
f6a02087 VZ |
205 | const wxCharBuffer cWC2MB(const wchar_t* in, |
206 | size_t inLen = wxNO_LEN, | |
207 | size_t *outLen = NULL) const; | |
79b40dcf | 208 | |
f501c3a9 | 209 | //@{ |
ee0b7af0 | 210 | /** |
f501c3a9 VZ |
211 | Converts from Unicode to the current wxChar type. |
212 | ||
213 | If wxChar is wchar_t, it returns the parameter unaltered. If wxChar is | |
214 | char, it returns the result in a wxCharBuffer. The macro wxWC2WXbuf is | |
215 | defined as the correct return type (without const). | |
ee0b7af0 | 216 | */ |
f501c3a9 VZ |
217 | const wchar_t* cWC2WX(const wchar_t* psz) const; |
218 | const wxCharBuffer cWC2WX(const wchar_t* psz) const; | |
219 | //@} | |
23324ae1 | 220 | |
f501c3a9 | 221 | //@{ |
23324ae1 | 222 | /** |
f501c3a9 VZ |
223 | Converts from the current wxChar type to multibyte encoding. |
224 | ||
225 | If wxChar is char, it returns the parameter unaltered. If wxChar is | |
226 | wchar_t, it returns the result in a wxCharBuffer. The macro wxWX2MBbuf | |
227 | is defined as the correct return type (without const). | |
23324ae1 | 228 | */ |
f501c3a9 VZ |
229 | const char* cWX2MB(const wxChar* psz) const; |
230 | const wxCharBuffer cWX2MB(const wxChar* psz) const; | |
231 | //@} | |
23324ae1 | 232 | |
f501c3a9 | 233 | //@{ |
23324ae1 | 234 | /** |
f501c3a9 | 235 | Converts from the current wxChar type to Unicode. |
3c4f71cc | 236 | |
f501c3a9 VZ |
237 | If wxChar is wchar_t, it returns the parameter unaltered. If wxChar is |
238 | char, it returns the result in a wxWCharBuffer. The macro wxWX2WCbuf is | |
239 | defined as the correct return type (without const). | |
23324ae1 | 240 | */ |
f501c3a9 VZ |
241 | const wchar_t* cWX2WC(const wxChar* psz) const; |
242 | const wxWCharBuffer cWX2WC(const wxChar* psz) const; | |
243 | //@} | |
23324ae1 FM |
244 | |
245 | /** | |
f501c3a9 VZ |
246 | @deprecated This function is deprecated, please use ToWChar() instead. |
247 | ||
248 | Converts from a string @a in in multibyte encoding to Unicode putting up to | |
249 | @a outLen characters into the buffer @e out. | |
250 | ||
251 | If @a out is @NULL, only the length of the string which would result | |
252 | from the conversion is calculated and returned. Note that this is the | |
253 | length and not size, i.e. the returned value does not include the | |
254 | trailing @c NUL. But when the function is called with a non-@NULL @a | |
255 | out buffer, the @a outLen parameter should be one more to allow | |
256 | the string to be properly @c NUL-terminated. | |
257 | ||
258 | @param out | |
259 | The output buffer, may be @NULL if the caller is only | |
260 | interested in the length of the resulting string | |
261 | @param in | |
262 | The NUL-terminated input string, cannot be @NULL | |
263 | @param outLen | |
264 | The length of the output buffer, including the trailing | |
265 | NUL; ignored if @a out is @NULL | |
266 | ||
267 | @return The length of the converted string excluding the trailing NUL. | |
23324ae1 | 268 | */ |
f501c3a9 | 269 | virtual size_t MB2WC(wchar_t* out, const char* in, size_t outLen) const; |
23324ae1 FM |
270 | |
271 | /** | |
f501c3a9 VZ |
272 | @deprecated This function is deprecated, please use FromWChar() instead. |
273 | ||
274 | Converts from Unicode to multibyte encoding. | |
275 | The semantics of this function (including the return value meaning) is | |
276 | the same as for wxMBConv::MB2WC. Notice that when the function is | |
277 | called with a non-@NULL buffer, the @a n parameter should be the size | |
278 | of the buffer and so it should take into account the trailing @c NUL, | |
279 | which might take two or four bytes for some encodings (UTF-16 and | |
280 | UTF-32) and not one. | |
23324ae1 | 281 | */ |
f501c3a9 | 282 | virtual size_t WC2MB(char* buf, const wchar_t* psz, size_t n) const; |
23324ae1 FM |
283 | }; |
284 | ||
285 | ||
286 | /** | |
f501c3a9 | 287 | @class wxMBConvUTF7 |
7c913512 | 288 | |
f501c3a9 VZ |
289 | This class converts between the UTF-7 encoding and Unicode. |
290 | It has one predefined instance, @b wxConvUTF7. | |
7c913512 | 291 | |
9d653e81 VZ |
292 | Notice that, unlike all the other conversion objects, this converter is |
293 | stateful, i.e. it remembers its state from the last call to its ToWChar() | |
294 | or FromWChar() and assumes it is called on the continuation of the same | |
295 | string when the same method is called again. This assumption is only made | |
296 | if an explicit length is specified as parameter to these functions: if an | |
297 | entire @c NUL terminated string is processed, the state doesn't need to be | |
298 | remembered. | |
299 | ||
300 | This also means that, unlike the other predefined conversion objects, | |
301 | @b wxConvUTF7 is @em not thread-safe. | |
302 | ||
f501c3a9 VZ |
303 | @library{wxbase} |
304 | @category{conv} | |
305 | ||
4701dc09 | 306 | @see wxMBConvUTF8, @ref overview_mbconv |
f501c3a9 VZ |
307 | */ |
308 | class wxMBConvUTF7 : public wxMBConv | |
309 | { | |
310 | }; | |
7c913512 | 311 | |
7c913512 | 312 | |
f501c3a9 VZ |
313 | |
314 | /** | |
315 | @class wxMBConvUTF8 | |
f501c3a9 VZ |
316 | |
317 | This class converts between the UTF-8 encoding and Unicode. | |
318 | It has one predefined instance, @b wxConvUTF8. | |
7c913512 | 319 | |
23324ae1 | 320 | @library{wxbase} |
79b40dcf | 321 | @category{conv} |
7c913512 | 322 | |
4701dc09 | 323 | @see wxMBConvUTF7, @ref overview_mbconv |
23324ae1 | 324 | */ |
f501c3a9 | 325 | class wxMBConvUTF8 : public wxMBConv |
23324ae1 | 326 | { |
23324ae1 FM |
327 | }; |
328 | ||
329 | ||
e54c96f1 | 330 | |
f501c3a9 VZ |
331 | /** |
332 | @class wxMBConvUTF16 | |
f501c3a9 VZ |
333 | |
334 | This class is used to convert between multibyte encodings and UTF-16 Unicode | |
335 | encoding (also known as UCS-2). | |
336 | ||
337 | Unlike UTF-8 encoding, UTF-16 uses words and not bytes and hence depends | |
338 | on the byte ordering: big or little endian. Hence this class is provided in | |
339 | two versions: wxMBConvUTF16LE and wxMBConvUTF16BE and wxMBConvUTF16 itself | |
340 | is just a typedef for one of them (native for the given platform, e.g. LE | |
341 | under Windows and BE under Mac). | |
342 | ||
343 | @library{wxbase} | |
344 | @category{conv} | |
345 | ||
4701dc09 | 346 | @see wxMBConvUTF8, wxMBConvUTF32, @ref overview_mbconv |
f501c3a9 VZ |
347 | */ |
348 | class wxMBConvUTF16 : public wxMBConv | |
349 | { | |
350 | }; | |
351 | ||
352 | ||
23324ae1 FM |
353 | /** |
354 | @class wxMBConvUTF32 | |
7c913512 | 355 | |
f501c3a9 VZ |
356 | This class is used to convert between multibyte encodings and UTF-32 |
357 | Unicode encoding (also known as UCS-4). | |
358 | Unlike UTF-8 encoding, UTF-32 uses (double) words and not bytes and hence | |
359 | depends on the byte ordering: big or little endian. Hence this class is | |
360 | provided in two versions: wxMBConvUTF32LE and wxMBConvUTF32BE and | |
361 | wxMBConvUTF32 itself is just a typedef for one of them (native for the | |
362 | given platform, e.g. LE under Windows and BE under Mac). | |
7c913512 | 363 | |
23324ae1 | 364 | @library{wxbase} |
79b40dcf | 365 | @category{conv} |
7c913512 | 366 | |
4701dc09 | 367 | @see wxMBConvUTF8, wxMBConvUTF16, @ref overview_mbconv |
23324ae1 FM |
368 | */ |
369 | class wxMBConvUTF32 : public wxMBConv | |
370 | { | |
23324ae1 FM |
371 | }; |
372 | ||
373 | ||
e54c96f1 | 374 | |
f501c3a9 | 375 | |
23324ae1 | 376 | /** |
f501c3a9 | 377 | @class wxCSConv |
7c913512 | 378 | |
f501c3a9 VZ |
379 | This class converts between any character set supported by the system and |
380 | Unicode. | |
7c913512 | 381 | |
f501c3a9 VZ |
382 | Please notice that this class uses system-provided conversion functions, |
383 | e.g. @c MultiByteToWideChar() and @c WideCharToMultiByte() under MSW and @c | |
384 | iconv(3) under Unix systems and as such may support different encodings and | |
385 | different encoding names on different platforms (although all relatively | |
386 | common encodings should be supported everywhere). | |
387 | ||
388 | It has one predefined instance, @b wxConvLocal, for the default user | |
389 | character set. | |
7c913512 | 390 | |
23324ae1 | 391 | @library{wxbase} |
79b40dcf | 392 | @category{conv} |
7c913512 | 393 | |
4701dc09 | 394 | @see wxMBConv, wxEncodingConverter, @ref overview_mbconv |
23324ae1 | 395 | */ |
f501c3a9 | 396 | class wxCSConv : public wxMBConv |
23324ae1 FM |
397 | { |
398 | public: | |
399 | /** | |
f501c3a9 | 400 | Constructor. |
23324ae1 | 401 | |
f501c3a9 VZ |
402 | You can specify the name of the character set you want to convert |
403 | from/to. If the character set name is not recognized, ISO 8859-1 is | |
404 | used as fallback; use IsOk() to test for this. | |
23324ae1 | 405 | |
f501c3a9 | 406 | @param charset The name of the encoding, shouldn't be empty. |
23324ae1 | 407 | */ |
f501c3a9 | 408 | wxCSConv(const wxString& charset); |
23324ae1 FM |
409 | |
410 | /** | |
f501c3a9 | 411 | Constructor. |
23324ae1 | 412 | |
f501c3a9 VZ |
413 | You can specify an encoding constant for the character set you want to |
414 | convert from/to. Use IsOk() after construction to check whether the | |
415 | encoding is supported by the current system. | |
416 | ||
417 | @param encoding Any valid (i.e. not wxFONTENCODING_MAX) font encoding. | |
23324ae1 | 418 | */ |
f501c3a9 | 419 | wxCSConv(wxFontEncoding encoding); |
23324ae1 FM |
420 | |
421 | /** | |
f501c3a9 VZ |
422 | Returns @true if the charset (or the encoding) given at constructor is |
423 | really available to use. | |
3c4f71cc | 424 | |
f501c3a9 | 425 | Returns @false if ISO 8859-1 will be used instead. |
3c4f71cc | 426 | |
f501c3a9 VZ |
427 | Note this does not mean that a given string will be correctly |
428 | converted. A malformed string may still make conversion functions | |
429 | return @c wxCONV_FAILED. | |
23324ae1 | 430 | |
f501c3a9 | 431 | @since 2.8.2 |
23324ae1 | 432 | */ |
f501c3a9 VZ |
433 | bool IsOk() const; |
434 | }; | |
23324ae1 | 435 | |
23324ae1 | 436 | |
23324ae1 | 437 | |
f501c3a9 VZ |
438 | /** |
439 | @class wxMBConvFile | |
23324ae1 | 440 | |
f501c3a9 VZ |
441 | This class used to define the class instance @b wxConvFileName, but |
442 | nowadays @b wxConvFileName is either of type wxConvLibc (on most platforms) | |
443 | or wxConvUTF8 (on MacOS X). | |
23324ae1 | 444 | |
f501c3a9 VZ |
445 | @b wxConvFileName converts filenames between filesystem multibyte encoding |
446 | and Unicode. @b wxConvFileName can also be set to something else at | |
447 | run-time which is used e.g. by wxGTK to use a class which checks the | |
448 | environment variable @b G_FILESYSTEM_ENCODING indicating that filenames | |
449 | should not be interpreted as UTF8 and also for converting invalid UTF8 | |
450 | characters (e.g. if there is a filename in iso8859_1) to strings with octal | |
451 | values. | |
23324ae1 | 452 | |
f501c3a9 VZ |
453 | Since some platforms (such as Win32) use Unicode in the filenames, |
454 | and others (such as Unix) use multibyte encodings, this class should only | |
455 | be used directly if wxMBFILES is defined to 1. A convenience macro, | |
456 | @c wxFNCONV, is defined to @c wxConvFileName->cWX2MB in this case. You | |
457 | could use it like this: | |
23324ae1 | 458 | |
f501c3a9 VZ |
459 | @code |
460 | wxChar *name = wxT("rawfile.doc"); | |
461 | FILE *fil = fopen(wxFNCONV(name), "r"); | |
462 | @endcode | |
463 | ||
464 | (although it would be better to just use wxFopen(name, "r") in this | |
465 | particular case, you only need to use this class for functions taking file | |
466 | names not wrapped by wxWidgets.) | |
e54c96f1 | 467 | |
f501c3a9 VZ |
468 | @library{wxbase} |
469 | @category{conv} | |
470 | ||
4701dc09 | 471 | @see @ref overview_mbconv |
f501c3a9 VZ |
472 | */ |
473 | class wxMBConvFile : public wxMBConv | |
474 | { | |
475 | public: | |
476 | }; |