]>
Commit | Line | Data |
---|---|---|
1 | ///////////////////////////////////////////////////////////////////////////// | |
2 | // Name: strconv.h | |
3 | // Purpose: interface of wxMBConv and derived conversion classes | |
4 | // Author: wxWidgets team | |
5 | // Licence: wxWindows licence | |
6 | ///////////////////////////////////////////////////////////////////////////// | |
7 | ||
8 | /** | |
9 | @class wxMBConv | |
10 | ||
11 | This class is the base class of a hierarchy of classes capable of | |
12 | converting text strings between multibyte (SBCS or DBCS) encodings and | |
13 | Unicode. | |
14 | ||
15 | This is an abstract base class which defines the operations implemented by | |
16 | all different conversion classes. The derived classes don't add any new | |
17 | operations of their own (except, possibly, some non-default constructors) | |
18 | and so you should simply use this class's ToWChar() and FromWChar() (or | |
19 | cMB2WC() and cWC2MB()) methods with the objects of the derived class. | |
20 | ||
21 | In the documentation for this and related classes please notice that | |
22 | the length of the string refers to the number of characters in the string | |
23 | not counting the terminating @c NUL, if any, while the size of the string | |
24 | is the total number of bytes in the string, including any trailing @c NUL. | |
25 | Thus, the length of the wide character string @c L"foo" is 3 while its size | |
26 | can be either 8 or 16 depending on whether @c wchar_t is 2 bytes (as | |
27 | under Windows) or 4 (Unix). | |
28 | ||
29 | @library{wxbase} | |
30 | @category{conv} | |
31 | ||
32 | @see wxCSConv, wxEncodingConverter, @ref overview_mbconv | |
33 | */ | |
34 | class wxMBConv | |
35 | { | |
36 | public: | |
37 | /** | |
38 | Trivial default constructor. | |
39 | */ | |
40 | wxMBConv(); | |
41 | ||
42 | /** | |
43 | This pure virtual function is overridden in each of the derived classes | |
44 | to return a new copy of the object it is called on. | |
45 | ||
46 | It is used for copying the conversion objects while preserving their | |
47 | dynamic type. | |
48 | */ | |
49 | virtual wxMBConv* Clone() const = 0; | |
50 | ||
51 | /** | |
52 | This function returns 1 for most of the multibyte encodings in which the | |
53 | string is terminated by a single @c NUL, 2 for UTF-16 and 4 for UTF-32 for | |
54 | which the string is terminated with 2 and 4 @c NUL characters respectively. | |
55 | The other cases are not currently supported and @c wxCONV_FAILED | |
56 | (defined as @c (size_t)-1) is returned for them. | |
57 | */ | |
58 | virtual size_t GetMBNulLen() const; | |
59 | ||
60 | /** | |
61 | Returns the maximal value which can be returned by GetMBNulLen() for | |
62 | any conversion object. | |
63 | ||
64 | Currently this value is 4. | |
65 | ||
66 | This method can be used to allocate the buffer with enough space for the | |
67 | trailing @c NUL characters for any encoding. | |
68 | */ | |
69 | static size_t GetMaxMBNulLen(); | |
70 | ||
71 | /** | |
72 | Convert multibyte string to a wide character one. | |
73 | ||
74 | This is the most general function for converting a multibyte string to | |
75 | a wide string, cMB2WC() may be often more convenient, however this | |
76 | function is the most efficient one as it allows avoiding any | |
77 | unnecessary copying. | |
78 | ||
79 | The main case is when @a dst is not @NULL and @a srcLen is not | |
80 | @c wxNO_LEN (which is defined as @c (size_t)-1): then the function | |
81 | converts exactly @a srcLen bytes starting at @a src into wide string | |
82 | which it outputs to @a dst. If the length of the resulting wide | |
83 | string is greater than @a dstLen, an error is returned. Note that if | |
84 | @a srcLen bytes don't include @c NUL characters, the resulting wide | |
85 | string is not @c NUL-terminated either. | |
86 | ||
87 | If @a srcLen is @c wxNO_LEN, the function supposes that the string is | |
88 | properly (i.e. as necessary for the encoding handled by this | |
89 | conversion) @c NUL-terminated and converts the entire string, including | |
90 | any trailing @c NUL bytes. In this case the wide string is also @c | |
91 | NUL-terminated. | |
92 | ||
93 | Finally, if @a dst is @NULL, the function returns the length of the | |
94 | needed buffer. | |
95 | ||
96 | Example of use of this function: | |
97 | @code | |
98 | size_t dstLen = conv.ToWChar(NULL, 0, src); | |
99 | if ( dstLen == wxCONV_FAILED ) | |
100 | ... handle error ... | |
101 | wchar_t *dst = new wchar_t[dstLen]; | |
102 | if ( conv.ToWChar(dst, dstLen, src) == wxCONV_FAILED ) | |
103 | ... handle error ... | |
104 | @endcode | |
105 | ||
106 | Notice that when passing the explicit source length the output will | |
107 | @e not be @c NUL terminated if you pass @c strlen(str) as parameter. | |
108 | Either leave @a srcLen as default @c wxNO_LEN or add one to @c strlen | |
109 | result if you want the output to be @c NUL terminated. | |
110 | ||
111 | @param dst | |
112 | Pointer to output buffer of the size of at least @a dstLen or @NULL. | |
113 | @param dstLen | |
114 | Maximal number of characters to be written to the output buffer if | |
115 | @a dst is non-@NULL, unused otherwise. | |
116 | @param src | |
117 | Pointer to the source string, must not be @NULL. | |
118 | @param srcLen | |
119 | The number of characters of the source string to convert or | |
120 | @c wxNO_LEN (default parameter) to convert everything up to and | |
121 | including the terminating @c NUL character(s). | |
122 | ||
123 | @return | |
124 | The number of characters written (or which would have been written | |
125 | if it were non-@NULL) to @a dst or @c wxCONV_FAILED on error. | |
126 | */ | |
127 | virtual size_t ToWChar(wchar_t* dst, size_t dstLen, const char* src, | |
128 | size_t srcLen = wxNO_LEN) const; | |
129 | ||
130 | /** | |
131 | Converts wide character string to multibyte. | |
132 | ||
133 | This function has the same semantics as ToWChar() except that it | |
134 | converts a wide string to multibyte one. As with ToWChar(), it may be | |
135 | more convenient to use cWC2MB() when working with @c NUL terminated | |
136 | strings. | |
137 | ||
138 | @param dst | |
139 | Pointer to output buffer of the size of at least @a dstLen or @NULL. | |
140 | @param dstLen | |
141 | Maximal number of characters to be written to the output buffer if | |
142 | @a dst is non-@NULL, unused otherwise. | |
143 | @param src | |
144 | Pointer to the source string, must not be @NULL. | |
145 | @param srcLen | |
146 | The number of characters of the source string to convert or | |
147 | @c wxNO_LEN (default parameter) to convert everything up to and | |
148 | including the terminating @c NUL character. | |
149 | ||
150 | @return | |
151 | The number of characters written (or which would have been written | |
152 | if it were non-@NULL) to @a dst or @c wxCONV_FAILED on error. | |
153 | */ | |
154 | virtual size_t FromWChar(char* dst, size_t dstLen, const wchar_t* src, | |
155 | size_t srcLen = wxNO_LEN) const; | |
156 | ||
157 | /** | |
158 | Converts from multibyte encoding to Unicode by calling ToWChar() and | |
159 | allocating a temporary wxWCharBuffer to hold the result. | |
160 | ||
161 | This function is a convenient wrapper around ToWChar() as it takes care | |
162 | of allocating the buffer of the necessary size itself. Its parameters | |
163 | have the same meaning as for ToWChar(), in particular @a inLen can be | |
164 | specified explicitly in which case exactly that many characters are | |
165 | converted and @a outLen receives (if non-@NULL) exactly the | |
166 | corresponding number of wide characters, whether the last one of them | |
167 | is @c NUL or not. However if @a inLen is @c wxNO_LEN, then @a outLen | |
168 | doesn't count the trailing @c NUL even if it is always present in this | |
169 | case. | |
170 | ||
171 | Finally notice that if the conversion fails, the returned buffer is | |
172 | invalid and @a outLen is set to 0 (and not @c wxCONV_FAILED for | |
173 | compatibility concerns). | |
174 | */ | |
175 | const wxWCharBuffer cMB2WC(const char* in, | |
176 | size_t inLen, | |
177 | size_t *outLen) const; | |
178 | ||
179 | /** | |
180 | Converts a char buffer to wide char one. | |
181 | ||
182 | This is the most convenient and safest conversion function as you | |
183 | don't have to deal with the buffer lengths directly. Use it if the | |
184 | input buffer is known not to be empty or if you are sure that the | |
185 | conversion is going to succeed -- otherwise, use the overload above to | |
186 | be able to distinguish between empty input and conversion failure. | |
187 | ||
188 | @return | |
189 | The buffer containing the converted text, empty if the input was | |
190 | empty or if the conversion failed. | |
191 | ||
192 | @since 2.9.1 | |
193 | */ | |
194 | const wxWCharBuffer cMB2WC(const wxCharBuffer& buf) const; | |
195 | ||
196 | //@{ | |
197 | /** | |
198 | Converts from multibyte encoding to the current wxChar type (which | |
199 | depends on whether wxUSE_UNICODE is set to 1). | |
200 | ||
201 | If wxChar is char, it returns the parameter unaltered. If wxChar is | |
202 | wchar_t, it returns the result in a wxWCharBuffer. The macro wxMB2WXbuf | |
203 | is defined as the correct return type (without const). | |
204 | */ | |
205 | const char* cMB2WX(const char* psz) const; | |
206 | const wxWCharBuffer cMB2WX(const char* psz) const; | |
207 | //@} | |
208 | ||
209 | /** | |
210 | Converts from Unicode to multibyte encoding by calling FromWChar() and | |
211 | allocating a temporary wxCharBuffer to hold the result. | |
212 | ||
213 | This function is a convenient wrapper around FromWChar() as it takes | |
214 | care of allocating the buffer of necessary size itself. | |
215 | ||
216 | Its parameters have the same meaning as the corresponding parameters of | |
217 | FromWChar(); please see the description of cMB2WC() for more details. | |
218 | */ | |
219 | const wxCharBuffer cWC2MB(const wchar_t* in, | |
220 | size_t inLen, | |
221 | size_t *outLen) const; | |
222 | ||
223 | /** | |
224 | Converts a wide char buffer to char one. | |
225 | ||
226 | This is the most convenient and safest conversion function as you | |
227 | don't have to deal with the buffer lengths directly. Use it if the | |
228 | input buffer is known not to be empty or if you are sure that the | |
229 | conversion is going to succeed -- otherwise, use the overload above to | |
230 | be able to distinguish between empty input and conversion failure. | |
231 | ||
232 | @return | |
233 | The buffer containing the converted text, empty if the input was | |
234 | empty or if the conversion failed. | |
235 | ||
236 | @since 2.9.1 | |
237 | */ | |
238 | const wxCharBuffer cWC2MB(const wxWCharBuffer& buf) const; | |
239 | ||
240 | //@{ | |
241 | /** | |
242 | Converts from Unicode to the current wxChar type. | |
243 | ||
244 | If wxChar is wchar_t, it returns the parameter unaltered. If wxChar is | |
245 | char, it returns the result in a wxCharBuffer. The macro wxWC2WXbuf is | |
246 | defined as the correct return type (without const). | |
247 | */ | |
248 | const wchar_t* cWC2WX(const wchar_t* psz) const; | |
249 | const wxCharBuffer cWC2WX(const wchar_t* psz) const; | |
250 | //@} | |
251 | ||
252 | //@{ | |
253 | /** | |
254 | Converts from the current wxChar type to multibyte encoding. | |
255 | ||
256 | If wxChar is char, it returns the parameter unaltered. If wxChar is | |
257 | wchar_t, it returns the result in a wxCharBuffer. The macro wxWX2MBbuf | |
258 | is defined as the correct return type (without const). | |
259 | */ | |
260 | const char* cWX2MB(const wxChar* psz) const; | |
261 | const wxCharBuffer cWX2MB(const wxChar* psz) const; | |
262 | //@} | |
263 | ||
264 | //@{ | |
265 | /** | |
266 | Converts from the current wxChar type to Unicode. | |
267 | ||
268 | If wxChar is wchar_t, it returns the parameter unaltered. If wxChar is | |
269 | char, it returns the result in a wxWCharBuffer. The macro wxWX2WCbuf is | |
270 | defined as the correct return type (without const). | |
271 | */ | |
272 | const wchar_t* cWX2WC(const wxChar* psz) const; | |
273 | const wxWCharBuffer cWX2WC(const wxChar* psz) const; | |
274 | //@} | |
275 | ||
276 | /** | |
277 | @deprecated This function is deprecated, please use ToWChar() instead. | |
278 | ||
279 | Converts from a string @a in multibyte encoding to Unicode putting up to | |
280 | @a outLen characters into the buffer @a out. | |
281 | ||
282 | If @a out is @NULL, only the length of the string which would result | |
283 | from the conversion is calculated and returned. Note that this is the | |
284 | length and not size, i.e. the returned value does not include the | |
285 | trailing @c NUL. But when the function is called with a non-@NULL @a | |
286 | out buffer, the @a outLen parameter should be one more to allow the | |
287 | string to be properly @c NUL-terminated. | |
288 | ||
289 | So to properly use this function you need to write: | |
290 | @code | |
291 | size_t lenConv = conv.MB2WC(NULL, in, 0); | |
292 | if ( lenConv == wxCONV_FAILED ) | |
293 | ... handle error ... | |
294 | // allocate 1 more character for the trailing NUL and also pass | |
295 | // the size of the buffer to the function now | |
296 | wchar_t *out = new wchar_t[lenConv + 1]; | |
297 | if ( conv.MB2WC(out, in, lenConv + 1) == wxCONV_FAILED ) | |
298 | ... handle error ... | |
299 | @endcode | |
300 | For this and other reasons, ToWChar() is strongly recommended as a | |
301 | replacement. | |
302 | ||
303 | @param out | |
304 | The output buffer, may be @NULL if the caller is only | |
305 | interested in the length of the resulting string | |
306 | @param in | |
307 | The NUL-terminated input string, cannot be @NULL | |
308 | @param outLen | |
309 | The length of the output buffer, including the trailing | |
310 | NUL, ignored if @a out is @NULL | |
311 | ||
312 | @return The length of the converted string excluding the trailing NUL. | |
313 | */ | |
314 | virtual size_t MB2WC(wchar_t* out, const char* in, size_t outLen) const; | |
315 | ||
316 | /** | |
317 | @deprecated This function is deprecated, please use FromWChar() instead. | |
318 | ||
319 | Converts from Unicode to multibyte encoding. | |
320 | The semantics of this function (including the return value meaning) are | |
321 | the same as for MB2WC(). Notice that when the function is | |
322 | called with a non-@NULL buffer, the @a n parameter should be the size | |
323 | of the buffer and so it should take into account the trailing @c NUL, | |
324 | which might take two or four bytes for some encodings (UTF-16 and | |
325 | UTF-32) and not one, i.e. GetMBNulLen(). | |
326 | */ | |
327 | virtual size_t WC2MB(char* buf, const wchar_t* psz, size_t n) const; | |
328 | }; | |
329 | ||
330 | ||
331 | /** | |
332 | @class wxMBConvUTF7 | |
333 | ||
334 | This class converts between the UTF-7 encoding and Unicode. | |
335 | It has one predefined instance, @b wxConvUTF7. | |
336 | ||
337 | Notice that, unlike all the other conversion objects, this converter is | |
338 | stateful, i.e. it remembers its state from the last call to its ToWChar() | |
339 | or FromWChar() and assumes it is called on the continuation of the same | |
340 | string when the same method is called again. This assumption is only made | |
341 | if an explicit length is passed to these functions: when an entire | |
342 | @c NUL-terminated string is processed, the state doesn't need to be | |
343 | remembered. | |
344 | ||
345 | This also means that, unlike the other predefined conversion objects, | |
346 | @b wxConvUTF7 is @em not thread-safe. | |
347 | ||
348 | @library{wxbase} | |
349 | @category{conv} | |
350 | ||
351 | @see wxMBConv, wxMBConvUTF8, @ref overview_mbconv | |
352 | */ | |
353 | class wxMBConvUTF7 : public wxMBConv | |
354 | { | |
355 | }; | |
356 | ||
357 | ||
358 | ||
359 | /** | |
360 | @class wxMBConvUTF8 | |
361 | ||
362 | This class converts between the UTF-8 encoding and Unicode. | |
363 | It has one predefined instance, @b wxConvUTF8. | |
364 | ||
365 | @library{wxbase} | |
366 | @category{conv} | |
367 | ||
368 | @see wxMBConv, wxMBConvUTF7, @ref overview_mbconv | |
369 | */ | |
370 | class wxMBConvUTF8 : public wxMBConv | |
371 | { | |
372 | }; | |
373 | ||
374 | ||
375 | ||
376 | /** | |
377 | @class wxMBConvUTF16 | |
378 | ||
379 | This class is used to convert between multibyte encodings and UTF-16 Unicode | |
380 | encoding (often loosely called UCS-2, which strictly is its BMP-only subset). | |
381 | ||
382 | Unlike UTF-8 encoding, UTF-16 uses words and not bytes and hence depends | |
383 | on the byte ordering: big or little endian. Hence this class is provided in | |
384 | two versions, wxMBConvUTF16LE and wxMBConvUTF16BE, and wxMBConvUTF16 itself | |
385 | is just a typedef for one of them (native for the given platform, e.g. LE | |
386 | under Windows and BE under Mac). | |
387 | ||
388 | @library{wxbase} | |
389 | @category{conv} | |
390 | ||
391 | @see wxMBConv, wxMBConvUTF8, wxMBConvUTF32, @ref overview_mbconv | |
392 | */ | |
393 | class wxMBConvUTF16 : public wxMBConv | |
394 | { | |
395 | }; | |
396 | ||
397 | ||
398 | /** | |
399 | @class wxMBConvUTF32 | |
400 | ||
401 | This class is used to convert between multibyte encodings and UTF-32 | |
402 | Unicode encoding (also known as UCS-4). | |
403 | Unlike UTF-8 encoding, UTF-32 uses (double) words and not bytes and hence | |
404 | depends on the byte ordering: big or little endian. Hence this class is | |
405 | provided in two versions, wxMBConvUTF32LE and wxMBConvUTF32BE, and | |
406 | wxMBConvUTF32 itself is just a typedef for one of them (native for the | |
407 | given platform, e.g. LE under Windows and BE under Mac). | |
408 | ||
409 | @library{wxbase} | |
410 | @category{conv} | |
411 | ||
412 | @see wxMBConv, wxMBConvUTF8, wxMBConvUTF16, @ref overview_mbconv | |
413 | */ | |
414 | class wxMBConvUTF32 : public wxMBConv | |
415 | { | |
416 | }; | |
417 | ||
418 | ||
419 | ||
420 | ||
421 | /** | |
422 | @class wxCSConv | |
423 | ||
424 | This class converts between any character set supported by the system and | |
425 | Unicode. | |
426 | ||
427 | Please notice that this class uses system-provided conversion functions, | |
428 | e.g. @c MultiByteToWideChar() and @c WideCharToMultiByte() under MSW and @c | |
429 | iconv(3) under Unix systems and as such may support different encodings and | |
430 | different encoding names on different platforms (although all relatively | |
431 | common encodings should be supported everywhere). | |
432 | ||
433 | It has one predefined instance, @b wxConvLocal, for the default user | |
434 | character set. | |
435 | ||
436 | @library{wxbase} | |
437 | @category{conv} | |
438 | ||
439 | @see wxMBConv, wxEncodingConverter, @ref overview_mbconv | |
440 | */ | |
441 | class wxCSConv : public wxMBConv | |
442 | { | |
443 | public: | |
444 | /** | |
445 | Constructor. | |
446 | ||
447 | You can specify the name of the character set you want to convert | |
448 | from/to. If the character set name is not recognized, ISO 8859-1 is | |
449 | used as a fallback; use IsOk() to test for this. | |
450 | ||
451 | @param charset The name of the encoding, shouldn't be empty. | |
452 | */ | |
453 | wxCSConv(const wxString& charset); | |
454 | ||
455 | /** | |
456 | Constructor. | |
457 | ||
458 | You can specify an encoding constant for the character set you want to | |
459 | convert from/to. Use IsOk() after construction to check whether the | |
460 | encoding is supported by the current system. | |
461 | ||
462 | @param encoding Any valid (i.e. not wxFONTENCODING_MAX) font encoding. | |
463 | */ | |
464 | wxCSConv(wxFontEncoding encoding); | |
465 | ||
466 | /** | |
467 | Returns @true if the charset (or the encoding) given at constructor is | |
468 | really available to use. | |
469 | ||
470 | Returns @false if ISO 8859-1 will be used instead. | |
471 | ||
472 | Note this does not mean that a given string will be correctly | |
473 | converted. A malformed string may still make conversion functions | |
474 | return @c wxCONV_FAILED. | |
475 | ||
476 | @since 2.8.2 | |
477 | */ | |
478 | bool IsOk() const; | |
479 | }; | |
480 | ||
481 | ||
482 | ||
483 | /** | |
484 | Conversion object used for converting file names from their external | |
485 | representation to the one used inside the program. | |
486 | ||
487 | @b wxConvFileName converts filenames between filesystem multibyte encoding | |
488 | and Unicode. @b wxConvFileName can also be set to something else at | |
489 | run-time which is used e.g. by wxGTK to use an object which checks the | |
490 | environment variable @b G_FILENAME_ENCODING indicating that filenames | |
491 | should not be interpreted as UTF8 and also for converting invalid UTF8 | |
492 | characters (e.g. if there is a filename in iso8859_1) to strings with octal | |
493 | values. | |
494 | ||
495 | Since some platforms (such as Win32) use Unicode in the filenames, | |
496 | and others (such as Unix) use multibyte encodings, this object should only | |
497 | be used directly if wxMBFILES is defined to 1. A convenience macro, | |
498 | @c wxFNCONV, is defined to @c wxConvFileName->cWX2MB in this case. You | |
499 | could use it like this: | |
500 | ||
501 | @code | |
502 | const wxChar *name = wxT("rawfile.doc"); | |
503 | FILE *fil = fopen(wxFNCONV(name), "r"); | |
504 | @endcode | |
505 | ||
506 | (although it would be better to just use wxFopen(name, "r") in this | |
507 | particular case; you only need to use this object for functions taking file | |
508 | names not wrapped by wxWidgets.) | |
509 | ||
510 | @library{wxbase} | |
511 | @category{conv} | |
512 | ||
513 | @see @ref overview_mbconv | |
514 | */ | |
515 | extern wxMBConv* wxConvFileName; |