]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
added the mention of library in which each class is defined to the documentation...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef __SALFORDC__
48 #include <clib.h>
49 #endif
50
51 #ifdef HAVE_ICONV
52 #include <iconv.h>
53 #include "wx/thread.h"
54 #endif
55
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
58
59 #ifdef __DARWIN__
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
62
63
64 #define TRACE_STRCONV _T("strconv")
65
66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
67 // be 4 bytes
68 #if SIZEOF_WCHAR_T == 2
69 #define WC_UTF16
70 #endif
71
72
73 // ============================================================================
74 // implementation
75 // ============================================================================
76
// helper of the conversion functions below: returns true if at least one of
// the n bytes starting at p is not NUL and false if all of them are NUL (in
// particular, false is returned when n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
85
86 // ----------------------------------------------------------------------------
87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
88 // ----------------------------------------------------------------------------
89
90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
91 {
92 if (input <= 0xffff)
93 {
94 if (output)
95 *output = (wxUint16) input;
96
97 return 1;
98 }
99 else if (input >= 0x110000)
100 {
101 return wxCONV_FAILED;
102 }
103 else
104 {
105 if (output)
106 {
107 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
108 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
109 }
110
111 return 2;
112 }
113 }
114
115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
116 {
117 if ((*input < 0xd800) || (*input > 0xdfff))
118 {
119 output = *input;
120 return 1;
121 }
122 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
123 {
124 output = *input;
125 return wxCONV_FAILED;
126 }
127 else
128 {
129 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
130 return 2;
131 }
132 }
133
134 #ifdef WC_UTF16
135 typedef wchar_t wxDecodeSurrogate_t;
136 #else // !WC_UTF16
137 typedef wxUint16 wxDecodeSurrogate_t;
138 #endif // WC_UTF16/!WC_UTF16
139
140 // returns the next UTF-32 character from the wchar_t buffer and advances the
141 // pointer to the character after this one
142 //
143 // if an invalid character is found, *pSrc is set to NULL, the caller must
144 // check for this
145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
146 {
147 wxUint32 out;
148 const size_t
149 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
150 if ( n == wxCONV_FAILED )
151 *pSrc = NULL;
152 else
153 *pSrc += n;
154
155 return out;
156 }
157
158 // ----------------------------------------------------------------------------
159 // wxMBConv
160 // ----------------------------------------------------------------------------
161
162 size_t
163 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
164 const char *src, size_t srcLen) const
165 {
166 // although new conversion classes are supposed to implement this function
167 // directly, the existins ones only implement the old MB2WC() and so, to
168 // avoid to have to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
172
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
175
176 // the number of NULs terminating this string
177 size_t nulLen = 0; // not really needed, but just to avoid warnings
178
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
185 if ( srcLen != wxNO_LEN )
186 {
187 // we need to know how to find the end of this string
188 nulLen = GetMBNulLen();
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
191
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
194 {
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
197 char * const p = bufTmp.data();
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
200 *s = '\0';
201
202 src = bufTmp;
203 }
204
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
211
212 for ( ;; )
213 {
214 // try to convert the current chunk
215 size_t lenChunk = MB2WC(NULL, src, 0);
216 if ( lenChunk == wxCONV_FAILED )
217 return wxCONV_FAILED;
218
219 lenChunk++; // for the L'\0' at the end of this chunk
220
221 dstWritten += lenChunk;
222
223 if ( lenChunk == 1 )
224 {
225 // nothing left in the input string, conversion succeeded
226 break;
227 }
228
229 if ( dst )
230 {
231 if ( dstWritten > dstLen )
232 return wxCONV_FAILED;
233
234 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
235 return wxCONV_FAILED;
236
237 dst += lenChunk;
238 }
239
240 if ( !srcEnd )
241 {
242 // we convert just one chunk in this case as this is the entire
243 // string anyhow
244 break;
245 }
246
247 // advance the input pointer past the end of this chunk
248 while ( NotAllNULs(src, nulLen) )
249 {
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
254 src += nulLen;
255 }
256
257 src += nulLen; // skipping over its terminator as well
258
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by inEnd
262 if ( src >= srcEnd )
263 break;
264 }
265
266 return dstWritten;
267 }
268
269 size_t
270 wxMBConv::FromWChar(char *dst, size_t dstLen,
271 const wchar_t *src, size_t srcLen) const
272 {
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten = 0;
275
276 // make a copy of the input string unless it is already properly
277 // NUL-terminated
278 //
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp;
282 if ( srcLen == wxNO_LEN )
283 {
284 srcLen = wxWcslen(src) + 1;
285 }
286 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
287 {
288 // make a copy in order to properly NUL-terminate the string
289 bufTmp = wxWCharBuffer(srcLen);
290 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
291 src = bufTmp;
292 }
293
294 const size_t lenNul = GetMBNulLen();
295 for ( const wchar_t * const srcEnd = src + srcLen;
296 src < srcEnd;
297 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
298 {
299 // try to convert the current chunk
300 size_t lenChunk = WC2MB(NULL, src, 0);
301
302 if ( lenChunk == wxCONV_FAILED )
303 return wxCONV_FAILED;
304
305 lenChunk += lenNul;
306 dstWritten += lenChunk;
307
308 if ( dst )
309 {
310 if ( dstWritten > dstLen )
311 return wxCONV_FAILED;
312
313 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dst += lenChunk;
317 }
318 }
319
320 return dstWritten;
321 }
322
323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
324 {
325 size_t rc = ToWChar(outBuff, outLen, inBuff);
326 if ( rc != wxCONV_FAILED )
327 {
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
330 rc--;
331 }
332
333 return rc;
334 }
335
336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
337 {
338 size_t rc = FromWChar(outBuff, outLen, inBuff);
339 if ( rc != wxCONV_FAILED )
340 {
341 rc -= GetMBNulLen();
342 }
343
344 return rc;
345 }
346
347 wxMBConv::~wxMBConv()
348 {
349 // nothing to do here (necessary for Darwin linking probably)
350 }
351
352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
353 {
354 if ( psz )
355 {
356 // calculate the length of the buffer needed first
357 const size_t nLen = ToWChar(NULL, 0, psz);
358 if ( nLen != wxCONV_FAILED )
359 {
360 // now do the actual conversion
361 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
362
363 // +1 for the trailing NULL
364 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
365 return buf;
366 }
367 }
368
369 return wxWCharBuffer();
370 }
371
372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
373 {
374 if ( pwz )
375 {
376 const size_t nLen = FromWChar(NULL, 0, pwz);
377 if ( nLen != wxCONV_FAILED )
378 {
379 wxCharBuffer buf(nLen - 1);
380 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
381 return buf;
382 }
383 }
384
385 return wxCharBuffer();
386 }
387
388 const wxWCharBuffer
389 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
390 {
391 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
392 if ( dstLen != wxCONV_FAILED )
393 {
394 wxWCharBuffer wbuf(dstLen - 1);
395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
396 {
397 if ( outLen )
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412 }
413
414 const wxCharBuffer
415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
416 {
417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
418 if ( dstLen != wxCONV_FAILED )
419 {
420 // special case of empty input: can't allocate 0 size buffer below as
421 // wxCharBuffer insists on NUL-terminating it
422 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
423 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
424 {
425 if ( outLen )
426 {
427 *outLen = dstLen;
428
429 const size_t nulLen = GetMBNulLen();
430 if ( dstLen >= nulLen &&
431 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
432 {
433 // in this case the output is NUL-terminated and we're not
434 // supposed to count NUL
435 *outLen -= nulLen;
436 }
437 }
438
439 return buf;
440 }
441 }
442
443 if ( outLen )
444 *outLen = 0;
445
446 return wxCharBuffer();
447 }
448
449 // ----------------------------------------------------------------------------
450 // wxMBConvLibc
451 // ----------------------------------------------------------------------------
452
453 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
454 {
455 return wxMB2WC(buf, psz, n);
456 }
457
458 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
459 {
460 return wxWC2MB(buf, psz, n);
461 }
462
463 // ----------------------------------------------------------------------------
464 // wxConvBrokenFileNames
465 // ----------------------------------------------------------------------------
466
467 #ifdef __UNIX__
468
469 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
470 {
471 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
472 wxStricmp(charset, _T("UTF8")) == 0 )
473 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
474 else
475 m_conv = new wxCSConv(charset);
476 }
477
478 #endif // __UNIX__
479
480 // ----------------------------------------------------------------------------
481 // UTF-7
482 // ----------------------------------------------------------------------------
483
484 // Implementation (C) 2004 Fredrik Roubert
485
//
// BASE64 decoding table: maps a byte to the 6 bit value it stands for in
// BASE64 or to 0xff if the byte is not part of the BASE64 alphabet
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0x8f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x90..0x9f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xa0..0xaf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xb0..0xbf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xc0..0xcf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xd0..0xdf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xe0..0xef
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xf0..0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
524
525 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
526 {
527 size_t len = 0;
528
529 while ( *psz && (!buf || (len < n)) )
530 {
531 unsigned char cc = *psz++;
532 if (cc != '+')
533 {
534 // plain ASCII char
535 if (buf)
536 *buf++ = cc;
537 len++;
538 }
539 else if (*psz == '-')
540 {
541 // encoded plus sign
542 if (buf)
543 *buf++ = cc;
544 len++;
545 psz++;
546 }
547 else // start of BASE64 encoded string
548 {
549 bool lsb, ok;
550 unsigned int d, l;
551 for ( ok = lsb = false, d = 0, l = 0;
552 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
553 psz++ )
554 {
555 d <<= 6;
556 d += cc;
557 for (l += 6; l >= 8; lsb = !lsb)
558 {
559 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
560 if (lsb)
561 {
562 if (buf)
563 *buf++ |= c;
564 len ++;
565 }
566 else
567 {
568 if (buf)
569 *buf = (wchar_t)(c << 8);
570 }
571
572 ok = true;
573 }
574 }
575
576 if ( !ok )
577 {
578 // in valid UTF7 we should have valid characters after '+'
579 return wxCONV_FAILED;
580 }
581
582 if (*psz == '-')
583 psz++;
584 }
585 }
586
587 if ( buf && (len < n) )
588 *buf = '\0';
589
590 return len;
591 }
592
//
// BASE64 encoding table: maps a 6 bit value to the character representing it
//
static const unsigned char utf7enb64[] =
{
    // 0..15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 16..31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 32..47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 48..63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
607
//
// UTF-7 encoding table giving the class of each of the 128 ASCII characters:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 0x70..0x7f
};
627
628 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
629 {
630 size_t len = 0;
631
632 while (*psz && ((!buf) || (len < n)))
633 {
634 wchar_t cc = *psz++;
635 if (cc < 0x80 && utf7encode[cc] < 1)
636 {
637 // plain ASCII char
638 if (buf)
639 *buf++ = (char)cc;
640
641 len++;
642 }
643 #ifndef WC_UTF16
644 else if (((wxUint32)cc) > 0xffff)
645 {
646 // no surrogate pair generation (yet?)
647 return wxCONV_FAILED;
648 }
649 #endif
650 else
651 {
652 if (buf)
653 *buf++ = '+';
654
655 len++;
656 if (cc != '+')
657 {
658 // BASE64 encode string
659 unsigned int lsb, d, l;
660 for (d = 0, l = 0; /*nothing*/; psz++)
661 {
662 for (lsb = 0; lsb < 2; lsb ++)
663 {
664 d <<= 8;
665 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
666
667 for (l += 8; l >= 6; )
668 {
669 l -= 6;
670 if (buf)
671 *buf++ = utf7enb64[(d >> l) % 64];
672 len++;
673 }
674 }
675
676 cc = *psz;
677 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
678 break;
679 }
680
681 if (l != 0)
682 {
683 if (buf)
684 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
685
686 len++;
687 }
688 }
689
690 if (buf)
691 *buf++ = '-';
692 len++;
693 }
694 }
695
696 if (buf && (len < n))
697 *buf = 0;
698
699 return len;
700 }
701
702 // ----------------------------------------------------------------------------
703 // UTF-8
704 // ----------------------------------------------------------------------------
705
706 static wxUint32 utf8_max[]=
707 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
708
709 // boundaries of the private use area we use to (temporarily) remap invalid
710 // characters invalid in a UTF-8 encoded string
711 const wxUint32 wxUnicodePUA = 0x100000;
712 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
713
// this table maps the first byte of a UTF-8 sequence to the total length of
// this sequence in bytes, 0 meaning that the byte can't start a sequence:
unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single-byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..CF: C0 and C1 are invalid too, two-byte sequences start from C2
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // D0..DF: two-byte sequences
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three-byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..FF: four-byte sequences for F0..F4, the rest is invalid again (5-
    // or 6-byte sequences and sequences for code points above U+10FFFF, as
    // restricted by RFC 3629)
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
748
749 size_t
750 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
751 const char *src, size_t srcLen) const
752 {
753 wchar_t *out = dstLen ? dst : NULL;
754 size_t written = 0;
755
756 if ( srcLen == wxNO_LEN )
757 srcLen = strlen(src) + 1;
758
759 for ( const char *p = src; ; p++ )
760 {
761 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
762 {
763 // all done successfully, just add the trailing NULL if we are not
764 // using explicit length
765 if ( srcLen == wxNO_LEN )
766 {
767 if ( out )
768 {
769 if ( !dstLen )
770 break;
771
772 *out = L'\0';
773 }
774
775 written++;
776 }
777
778 return written;
779 }
780
781 unsigned char c = *p;
782 unsigned len = tableUtf8Lengths[c];
783 if ( !len )
784 break;
785
786 if ( srcLen < len ) // the test works for wxNO_LEN too
787 break;
788
789 if ( srcLen != wxNO_LEN )
790 srcLen -= len;
791
792 if ( out && !dstLen-- )
793 break;
794
795
796 // Char. number range | UTF-8 octet sequence
797 // (hexadecimal) | (binary)
798 // ----------------------+---------------------------------------------
799 // 0000 0000 - 0000 007F | 0xxxxxxx
800 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
801 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
802 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
803 //
804 // Code point value is stored in bits marked with 'x', lowest-order bit
805 // of the value on the right side in the diagram above.
806 // (from RFC 3629)
807
808 // mask to extract lead byte's value ('x' bits above), by sequence length:
809 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
810
811 // mask and value of lead byte's most significant bits, by length:
812 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
813 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
814
815 len--; // it's more convenient to work with 0-based length here
816
817 // extract the lead byte's value bits:
818 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
819 break;
820
821 wxUint32 code = c & leadValueMask[len];
822
823 // all remaining bytes, if any, are handled in the same way regardless of
824 // sequence's length:
825 for ( ; len; --len )
826 {
827 c = *++p;
828 if ( (c & 0xC0) != 0x80 )
829 return wxCONV_FAILED;
830
831 code <<= 6;
832 code |= c & 0x3F;
833 }
834
835 #ifdef WC_UTF16
836 // cast is ok because wchar_t == wxUint16 if WC_UTF16
837 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
838 {
839 if ( out )
840 out++;
841 written++;
842 }
843 #else // !WC_UTF16
844 if ( out )
845 *out = code;
846 #endif // WC_UTF16/!WC_UTF16
847
848 if ( out )
849 out++;
850
851 written++;
852 }
853
854 return wxCONV_FAILED;
855 }
856
857 size_t
858 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
859 const wchar_t *src, size_t srcLen) const
860 {
861 char *out = dstLen ? dst : NULL;
862 size_t written = 0;
863
864 for ( const wchar_t *wp = src; ; wp++ )
865 {
866 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
867 {
868 // all done successfully, just add the trailing NULL if we are not
869 // using explicit length
870 if ( srcLen == wxNO_LEN )
871 {
872 if ( out )
873 {
874 if ( !dstLen )
875 break;
876
877 *out = '\0';
878 }
879
880 written++;
881 }
882
883 return written;
884 }
885
886
887 wxUint32 code;
888 #ifdef WC_UTF16
889 // cast is ok for WC_UTF16
890 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
891 {
892 // skip the next char too as we decoded a surrogate
893 wp++;
894 }
895 #else // wchar_t is UTF-32
896 code = *wp & 0x7fffffff;
897 #endif
898
899 unsigned len;
900 if ( code <= 0x7F )
901 {
902 len = 1;
903 if ( out )
904 {
905 if ( dstLen < len )
906 break;
907
908 out[0] = (char)code;
909 }
910 }
911 else if ( code <= 0x07FF )
912 {
913 len = 2;
914 if ( out )
915 {
916 if ( dstLen < len )
917 break;
918
919 // NB: this line takes 6 least significant bits, encodes them as
920 // 10xxxxxx and discards them so that the next byte can be encoded:
921 out[1] = 0x80 | (code & 0x3F); code >>= 6;
922 out[0] = 0xC0 | code;
923 }
924 }
925 else if ( code < 0xFFFF )
926 {
927 len = 3;
928 if ( out )
929 {
930 if ( dstLen < len )
931 break;
932
933 out[2] = 0x80 | (code & 0x3F); code >>= 6;
934 out[1] = 0x80 | (code & 0x3F); code >>= 6;
935 out[0] = 0xE0 | code;
936 }
937 }
938 else if ( code <= 0x10FFFF )
939 {
940 len = 4;
941 if ( out )
942 {
943 if ( dstLen < len )
944 break;
945
946 out[3] = 0x80 | (code & 0x3F); code >>= 6;
947 out[2] = 0x80 | (code & 0x3F); code >>= 6;
948 out[1] = 0x80 | (code & 0x3F); code >>= 6;
949 out[0] = 0xF0 | code;
950 }
951 }
952 else
953 {
954 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
955 break;
956 }
957
958 if ( out )
959 {
960 out += len;
961 dstLen -= len;
962 }
963
964 written += len;
965 }
966
967 // we only get here if an error occurs during decoding
968 return wxCONV_FAILED;
969 }
970
971 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
972 {
973 if ( m_options == MAP_INVALID_UTF8_NOT )
974 return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
975
976 size_t len = 0;
977
978 while (*psz && ((!buf) || (len < n)))
979 {
980 const char *opsz = psz;
981 bool invalid = false;
982 unsigned char cc = *psz++, fc = cc;
983 unsigned cnt;
984 for (cnt = 0; fc & 0x80; cnt++)
985 fc <<= 1;
986
987 if (!cnt)
988 {
989 // plain ASCII char
990 if (buf)
991 *buf++ = cc;
992 len++;
993
994 // escape the escape character for octal escapes
995 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
996 && cc == '\\' && (!buf || len < n))
997 {
998 if (buf)
999 *buf++ = cc;
1000 len++;
1001 }
1002 }
1003 else
1004 {
1005 cnt--;
1006 if (!cnt)
1007 {
1008 // invalid UTF-8 sequence
1009 invalid = true;
1010 }
1011 else
1012 {
1013 unsigned ocnt = cnt - 1;
1014 wxUint32 res = cc & (0x3f >> cnt);
1015 while (cnt--)
1016 {
1017 cc = *psz;
1018 if ((cc & 0xC0) != 0x80)
1019 {
1020 // invalid UTF-8 sequence
1021 invalid = true;
1022 break;
1023 }
1024
1025 psz++;
1026 res = (res << 6) | (cc & 0x3f);
1027 }
1028
1029 if (invalid || res <= utf8_max[ocnt])
1030 {
1031 // illegal UTF-8 encoding
1032 invalid = true;
1033 }
1034 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1035 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1036 {
1037 // if one of our PUA characters turns up externally
1038 // it must also be treated as an illegal sequence
1039 // (a bit like you have to escape an escape character)
1040 invalid = true;
1041 }
1042 else
1043 {
1044 #ifdef WC_UTF16
1045 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1046 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1047 if (pa == wxCONV_FAILED)
1048 {
1049 invalid = true;
1050 }
1051 else
1052 {
1053 if (buf)
1054 buf += pa;
1055 len += pa;
1056 }
1057 #else // !WC_UTF16
1058 if (buf)
1059 *buf++ = (wchar_t)res;
1060 len++;
1061 #endif // WC_UTF16/!WC_UTF16
1062 }
1063 }
1064
1065 if (invalid)
1066 {
1067 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1068 {
1069 while (opsz < psz && (!buf || len < n))
1070 {
1071 #ifdef WC_UTF16
1072 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1073 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1074 wxASSERT(pa != wxCONV_FAILED);
1075 if (buf)
1076 buf += pa;
1077 opsz++;
1078 len += pa;
1079 #else
1080 if (buf)
1081 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1082 opsz++;
1083 len++;
1084 #endif
1085 }
1086 }
1087 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1088 {
1089 while (opsz < psz && (!buf || len < n))
1090 {
1091 if ( buf && len + 3 < n )
1092 {
1093 unsigned char on = *opsz;
1094 *buf++ = L'\\';
1095 *buf++ = (wchar_t)( L'0' + on / 0100 );
1096 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1097 *buf++ = (wchar_t)( L'0' + on % 010 );
1098 }
1099
1100 opsz++;
1101 len += 4;
1102 }
1103 }
1104 else // MAP_INVALID_UTF8_NOT
1105 {
1106 return wxCONV_FAILED;
1107 }
1108 }
1109 }
1110 }
1111
1112 if (buf && (len < n))
1113 *buf = 0;
1114
1115 return len;
1116 }
1117
// returns true if wch is one of the octal digits L'0'..L'7'
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
1122
1123 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1124 {
1125 if ( m_options == MAP_INVALID_UTF8_NOT )
1126 return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
1127
1128 size_t len = 0;
1129
1130 while (*psz && ((!buf) || (len < n)))
1131 {
1132 wxUint32 cc;
1133
1134 #ifdef WC_UTF16
1135 // cast is ok for WC_UTF16
1136 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1137 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1138 #else
1139 cc = (*psz++) & 0x7fffffff;
1140 #endif
1141
1142 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1143 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1144 {
1145 if (buf)
1146 *buf++ = (char)(cc - wxUnicodePUA);
1147 len++;
1148 }
1149 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1150 && cc == L'\\' && psz[0] == L'\\' )
1151 {
1152 if (buf)
1153 *buf++ = (char)cc;
1154 psz++;
1155 len++;
1156 }
1157 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1158 cc == L'\\' &&
1159 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1160 {
1161 if (buf)
1162 {
1163 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1164 (psz[1] - L'0') * 010 +
1165 (psz[2] - L'0'));
1166 }
1167
1168 psz += 3;
1169 len++;
1170 }
1171 else
1172 {
1173 unsigned cnt;
1174 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1175 {
1176 }
1177
1178 if (!cnt)
1179 {
1180 // plain ASCII char
1181 if (buf)
1182 *buf++ = (char) cc;
1183 len++;
1184 }
1185 else
1186 {
1187 len += cnt + 1;
1188 if (buf)
1189 {
1190 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1191 while (cnt--)
1192 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1193 }
1194 }
1195 }
1196 }
1197
1198 if (buf && (len < n))
1199 *buf = 0;
1200
1201 return len;
1202 }
1203
1204 // ============================================================================
1205 // UTF-16
1206 // ============================================================================
1207
1208 #ifdef WORDS_BIGENDIAN
1209 #define wxMBConvUTF16straight wxMBConvUTF16BE
1210 #define wxMBConvUTF16swap wxMBConvUTF16LE
1211 #else
1212 #define wxMBConvUTF16swap wxMBConvUTF16BE
1213 #define wxMBConvUTF16straight wxMBConvUTF16LE
1214 #endif
1215
1216 /* static */
1217 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1218 {
1219 if ( srcLen == wxNO_LEN )
1220 {
1221 // count the number of bytes in input, including the trailing NULs
1222 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1223 for ( srcLen = 1; *inBuff++; srcLen++ )
1224 ;
1225
1226 srcLen *= BYTES_PER_CHAR;
1227 }
1228 else // we already have the length
1229 {
1230 // we can only convert an entire number of UTF-16 characters
1231 if ( srcLen % BYTES_PER_CHAR )
1232 return wxCONV_FAILED;
1233 }
1234
1235 return srcLen;
1236 }
1237
1238 // case when in-memory representation is UTF-16 too
1239 #ifdef WC_UTF16
1240
1241 // ----------------------------------------------------------------------------
1242 // conversions without endianness change
1243 // ----------------------------------------------------------------------------
1244
1245 size_t
1246 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1247 const char *src, size_t srcLen) const
1248 {
1249 // set up the scene for using memcpy() (which is presumably more efficient
1250 // than copying the bytes one by one)
1251 srcLen = GetLength(src, srcLen);
1252 if ( srcLen == wxNO_LEN )
1253 return wxCONV_FAILED;
1254
1255 const size_t inLen = srcLen / BYTES_PER_CHAR;
1256 if ( dst )
1257 {
1258 if ( dstLen < inLen )
1259 return wxCONV_FAILED;
1260
1261 memcpy(dst, src, srcLen);
1262 }
1263
1264 return inLen;
1265 }
1266
1267 size_t
1268 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1269 const wchar_t *src, size_t srcLen) const
1270 {
1271 if ( srcLen == wxNO_LEN )
1272 srcLen = wxWcslen(src) + 1;
1273
1274 srcLen *= BYTES_PER_CHAR;
1275
1276 if ( dst )
1277 {
1278 if ( dstLen < srcLen )
1279 return wxCONV_FAILED;
1280
1281 memcpy(dst, src, srcLen);
1282 }
1283
1284 return srcLen;
1285 }
1286
1287 // ----------------------------------------------------------------------------
1288 // endian-reversing conversions
1289 // ----------------------------------------------------------------------------
1290
1291 size_t
1292 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1293 const char *src, size_t srcLen) const
1294 {
1295 srcLen = GetLength(src, srcLen);
1296 if ( srcLen == wxNO_LEN )
1297 return wxCONV_FAILED;
1298
1299 srcLen /= BYTES_PER_CHAR;
1300
1301 if ( dst )
1302 {
1303 if ( dstLen < srcLen )
1304 return wxCONV_FAILED;
1305
1306 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1307 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1308 {
1309 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1310 }
1311 }
1312
1313 return srcLen;
1314 }
1315
1316 size_t
1317 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1318 const wchar_t *src, size_t srcLen) const
1319 {
1320 if ( srcLen == wxNO_LEN )
1321 srcLen = wxWcslen(src) + 1;
1322
1323 srcLen *= BYTES_PER_CHAR;
1324
1325 if ( dst )
1326 {
1327 if ( dstLen < srcLen )
1328 return wxCONV_FAILED;
1329
1330 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1331 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1332 {
1333 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1334 }
1335 }
1336
1337 return srcLen;
1338 }
1339
1340 #else // !WC_UTF16: wchar_t is UTF-32
1341
1342 // ----------------------------------------------------------------------------
1343 // conversions without endianness change
1344 // ----------------------------------------------------------------------------
1345
1346 size_t
1347 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1348 const char *src, size_t srcLen) const
1349 {
1350 srcLen = GetLength(src, srcLen);
1351 if ( srcLen == wxNO_LEN )
1352 return wxCONV_FAILED;
1353
1354 const size_t inLen = srcLen / BYTES_PER_CHAR;
1355 if ( !dst )
1356 {
1357 // optimization: return maximal space which could be needed for this
1358 // string even if the real size could be smaller if the buffer contains
1359 // any surrogates
1360 return inLen;
1361 }
1362
1363 size_t outLen = 0;
1364 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1365 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1366 {
1367 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1368 if ( !inBuff )
1369 return wxCONV_FAILED;
1370
1371 if ( ++outLen > dstLen )
1372 return wxCONV_FAILED;
1373
1374 *dst++ = ch;
1375 }
1376
1377
1378 return outLen;
1379 }
1380
1381 size_t
1382 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1383 const wchar_t *src, size_t srcLen) const
1384 {
1385 if ( srcLen == wxNO_LEN )
1386 srcLen = wxWcslen(src) + 1;
1387
1388 size_t outLen = 0;
1389 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1390 for ( size_t n = 0; n < srcLen; n++ )
1391 {
1392 wxUint16 cc[2];
1393 const size_t numChars = encode_utf16(*src++, cc);
1394 if ( numChars == wxCONV_FAILED )
1395 return wxCONV_FAILED;
1396
1397 outLen += numChars * BYTES_PER_CHAR;
1398 if ( outBuff )
1399 {
1400 if ( outLen > dstLen )
1401 return wxCONV_FAILED;
1402
1403 *outBuff++ = cc[0];
1404 if ( numChars == 2 )
1405 {
1406 // second character of a surrogate
1407 *outBuff++ = cc[1];
1408 }
1409 }
1410 }
1411
1412 return outLen;
1413 }
1414
1415 // ----------------------------------------------------------------------------
1416 // endian-reversing conversions
1417 // ----------------------------------------------------------------------------
1418
1419 size_t
1420 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1421 const char *src, size_t srcLen) const
1422 {
1423 srcLen = GetLength(src, srcLen);
1424 if ( srcLen == wxNO_LEN )
1425 return wxCONV_FAILED;
1426
1427 const size_t inLen = srcLen / BYTES_PER_CHAR;
1428 if ( !dst )
1429 {
1430 // optimization: return maximal space which could be needed for this
1431 // string even if the real size could be smaller if the buffer contains
1432 // any surrogates
1433 return inLen;
1434 }
1435
1436 size_t outLen = 0;
1437 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1438 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1439 {
1440 wxUint32 ch;
1441 wxUint16 tmp[2];
1442
1443 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1444 inBuff++;
1445 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1446
1447 const size_t numChars = decode_utf16(tmp, ch);
1448 if ( numChars == wxCONV_FAILED )
1449 return wxCONV_FAILED;
1450
1451 if ( numChars == 2 )
1452 inBuff++;
1453
1454 if ( ++outLen > dstLen )
1455 return wxCONV_FAILED;
1456
1457 *dst++ = ch;
1458 }
1459
1460
1461 return outLen;
1462 }
1463
1464 size_t
1465 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1466 const wchar_t *src, size_t srcLen) const
1467 {
1468 if ( srcLen == wxNO_LEN )
1469 srcLen = wxWcslen(src) + 1;
1470
1471 size_t outLen = 0;
1472 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1473 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1474 {
1475 wxUint16 cc[2];
1476 const size_t numChars = encode_utf16(*src, cc);
1477 if ( numChars == wxCONV_FAILED )
1478 return wxCONV_FAILED;
1479
1480 outLen += numChars * BYTES_PER_CHAR;
1481 if ( outBuff )
1482 {
1483 if ( outLen > dstLen )
1484 return wxCONV_FAILED;
1485
1486 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1487 if ( numChars == 2 )
1488 {
1489 // second character of a surrogate
1490 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1491 }
1492 }
1493 }
1494
1495 return outLen;
1496 }
1497
1498 #endif // WC_UTF16/!WC_UTF16
1499
1500
1501 // ============================================================================
1502 // UTF-32
1503 // ============================================================================
1504
1505 #ifdef WORDS_BIGENDIAN
1506 #define wxMBConvUTF32straight wxMBConvUTF32BE
1507 #define wxMBConvUTF32swap wxMBConvUTF32LE
1508 #else
1509 #define wxMBConvUTF32swap wxMBConvUTF32BE
1510 #define wxMBConvUTF32straight wxMBConvUTF32LE
1511 #endif
1512
1513
1514 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1515 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1516
1517 /* static */
1518 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1519 {
1520 if ( srcLen == wxNO_LEN )
1521 {
1522 // count the number of bytes in input, including the trailing NULs
1523 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1524 for ( srcLen = 1; *inBuff++; srcLen++ )
1525 ;
1526
1527 srcLen *= BYTES_PER_CHAR;
1528 }
1529 else // we already have the length
1530 {
1531 // we can only convert an entire number of UTF-32 characters
1532 if ( srcLen % BYTES_PER_CHAR )
1533 return wxCONV_FAILED;
1534 }
1535
1536 return srcLen;
1537 }
1538
1539 // case when in-memory representation is UTF-16
1540 #ifdef WC_UTF16
1541
1542 // ----------------------------------------------------------------------------
1543 // conversions without endianness change
1544 // ----------------------------------------------------------------------------
1545
1546 size_t
1547 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1548 const char *src, size_t srcLen) const
1549 {
1550 srcLen = GetLength(src, srcLen);
1551 if ( srcLen == wxNO_LEN )
1552 return wxCONV_FAILED;
1553
1554 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1555 const size_t inLen = srcLen / BYTES_PER_CHAR;
1556 size_t outLen = 0;
1557 for ( size_t n = 0; n < inLen; n++ )
1558 {
1559 wxUint16 cc[2];
1560 const size_t numChars = encode_utf16(*inBuff++, cc);
1561 if ( numChars == wxCONV_FAILED )
1562 return wxCONV_FAILED;
1563
1564 outLen += numChars;
1565 if ( dst )
1566 {
1567 if ( outLen > dstLen )
1568 return wxCONV_FAILED;
1569
1570 *dst++ = cc[0];
1571 if ( numChars == 2 )
1572 {
1573 // second character of a surrogate
1574 *dst++ = cc[1];
1575 }
1576 }
1577 }
1578
1579 return outLen;
1580 }
1581
1582 size_t
1583 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1584 const wchar_t *src, size_t srcLen) const
1585 {
1586 if ( srcLen == wxNO_LEN )
1587 srcLen = wxWcslen(src) + 1;
1588
1589 if ( !dst )
1590 {
1591 // optimization: return maximal space which could be needed for this
1592 // string instead of the exact amount which could be less if there are
1593 // any surrogates in the input
1594 //
1595 // we consider that surrogates are rare enough to make it worthwhile to
1596 // avoid running the loop below at the cost of slightly extra memory
1597 // consumption
1598 return srcLen * BYTES_PER_CHAR;
1599 }
1600
1601 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1602 size_t outLen = 0;
1603 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1604 {
1605 const wxUint32 ch = wxDecodeSurrogate(&src);
1606 if ( !src )
1607 return wxCONV_FAILED;
1608
1609 outLen += BYTES_PER_CHAR;
1610
1611 if ( outLen > dstLen )
1612 return wxCONV_FAILED;
1613
1614 *outBuff++ = ch;
1615 }
1616
1617 return outLen;
1618 }
1619
1620 // ----------------------------------------------------------------------------
1621 // endian-reversing conversions
1622 // ----------------------------------------------------------------------------
1623
1624 size_t
1625 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1626 const char *src, size_t srcLen) const
1627 {
1628 srcLen = GetLength(src, srcLen);
1629 if ( srcLen == wxNO_LEN )
1630 return wxCONV_FAILED;
1631
1632 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1633 const size_t inLen = srcLen / BYTES_PER_CHAR;
1634 size_t outLen = 0;
1635 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1636 {
1637 wxUint16 cc[2];
1638 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1639 if ( numChars == wxCONV_FAILED )
1640 return wxCONV_FAILED;
1641
1642 outLen += numChars;
1643 if ( dst )
1644 {
1645 if ( outLen > dstLen )
1646 return wxCONV_FAILED;
1647
1648 *dst++ = cc[0];
1649 if ( numChars == 2 )
1650 {
1651 // second character of a surrogate
1652 *dst++ = cc[1];
1653 }
1654 }
1655 }
1656
1657 return outLen;
1658 }
1659
1660 size_t
1661 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1662 const wchar_t *src, size_t srcLen) const
1663 {
1664 if ( srcLen == wxNO_LEN )
1665 srcLen = wxWcslen(src) + 1;
1666
1667 if ( !dst )
1668 {
1669 // optimization: return maximal space which could be needed for this
1670 // string instead of the exact amount which could be less if there are
1671 // any surrogates in the input
1672 //
1673 // we consider that surrogates are rare enough to make it worthwhile to
1674 // avoid running the loop below at the cost of slightly extra memory
1675 // consumption
1676 return srcLen*BYTES_PER_CHAR;
1677 }
1678
1679 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1680 size_t outLen = 0;
1681 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1682 {
1683 const wxUint32 ch = wxDecodeSurrogate(&src);
1684 if ( !src )
1685 return wxCONV_FAILED;
1686
1687 outLen += BYTES_PER_CHAR;
1688
1689 if ( outLen > dstLen )
1690 return wxCONV_FAILED;
1691
1692 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1693 }
1694
1695 return outLen;
1696 }
1697
1698 #else // !WC_UTF16: wchar_t is UTF-32
1699
1700 // ----------------------------------------------------------------------------
1701 // conversions without endianness change
1702 // ----------------------------------------------------------------------------
1703
1704 size_t
1705 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1706 const char *src, size_t srcLen) const
1707 {
1708 // use memcpy() as it should be much faster than hand-written loop
1709 srcLen = GetLength(src, srcLen);
1710 if ( srcLen == wxNO_LEN )
1711 return wxCONV_FAILED;
1712
1713 const size_t inLen = srcLen/BYTES_PER_CHAR;
1714 if ( dst )
1715 {
1716 if ( dstLen < inLen )
1717 return wxCONV_FAILED;
1718
1719 memcpy(dst, src, srcLen);
1720 }
1721
1722 return inLen;
1723 }
1724
1725 size_t
1726 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1727 const wchar_t *src, size_t srcLen) const
1728 {
1729 if ( srcLen == wxNO_LEN )
1730 srcLen = wxWcslen(src) + 1;
1731
1732 srcLen *= BYTES_PER_CHAR;
1733
1734 if ( dst )
1735 {
1736 if ( dstLen < srcLen )
1737 return wxCONV_FAILED;
1738
1739 memcpy(dst, src, srcLen);
1740 }
1741
1742 return srcLen;
1743 }
1744
1745 // ----------------------------------------------------------------------------
1746 // endian-reversing conversions
1747 // ----------------------------------------------------------------------------
1748
1749 size_t
1750 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1751 const char *src, size_t srcLen) const
1752 {
1753 srcLen = GetLength(src, srcLen);
1754 if ( srcLen == wxNO_LEN )
1755 return wxCONV_FAILED;
1756
1757 srcLen /= BYTES_PER_CHAR;
1758
1759 if ( dst )
1760 {
1761 if ( dstLen < srcLen )
1762 return wxCONV_FAILED;
1763
1764 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1765 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1766 {
1767 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1768 }
1769 }
1770
1771 return srcLen;
1772 }
1773
1774 size_t
1775 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1776 const wchar_t *src, size_t srcLen) const
1777 {
1778 if ( srcLen == wxNO_LEN )
1779 srcLen = wxWcslen(src) + 1;
1780
1781 srcLen *= BYTES_PER_CHAR;
1782
1783 if ( dst )
1784 {
1785 if ( dstLen < srcLen )
1786 return wxCONV_FAILED;
1787
1788 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1789 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1790 {
1791 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1792 }
1793 }
1794
1795 return srcLen;
1796 }
1797
1798 #endif // WC_UTF16/!WC_UTF16
1799
1800
1801 // ============================================================================
1802 // The classes doing conversion using the iconv_xxx() functions
1803 // ============================================================================
1804
1805 #ifdef HAVE_ICONV
1806
1807 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1808 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1809 // (unless there's yet another bug in glibc) the only case when iconv()
1810 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1811 // left in the input buffer -- when _real_ error occurs,
1812 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1813 // iconv() failure.
1814 // [This bug does not appear in glibc 2.2.]
1815 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1816 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1817 (errno != E2BIG || bufLeft != 0))
1818 #else
1819 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1820 #endif
1821
1822 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1823
1824 #define ICONV_T_INVALID ((iconv_t)-1)
1825
1826 #if SIZEOF_WCHAR_T == 4
1827 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1828 #define WC_ENC wxFONTENCODING_UTF32
1829 #elif SIZEOF_WCHAR_T == 2
1830 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1831 #define WC_ENC wxFONTENCODING_UTF16
1832 #else // sizeof(wchar_t) != 2 nor 4
1833 // does this ever happen?
1834 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1835 #endif
1836
1837 // ----------------------------------------------------------------------------
1838 // wxMBConv_iconv: encapsulates an iconv character set
1839 // ----------------------------------------------------------------------------
1840
1841 class wxMBConv_iconv : public wxMBConv
1842 {
1843 public:
1844 wxMBConv_iconv(const char *name);
1845 virtual ~wxMBConv_iconv();
1846
1847 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1848 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1849
1850 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1851 virtual size_t GetMBNulLen() const;
1852
1853 #if wxUSE_UNICODE_UTF8
1854 virtual bool IsUTF8() const;
1855 #endif
1856
1857 virtual wxMBConv *Clone() const
1858 {
1859 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1860 p->m_minMBCharWidth = m_minMBCharWidth;
1861 return p;
1862 }
1863
1864 bool IsOk() const
1865 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1866
1867 protected:
1868 // the iconv handlers used to translate from multibyte
1869 // to wide char and in the other direction
1870 iconv_t m2w,
1871 w2m;
1872
1873 #if wxUSE_THREADS
1874 // guards access to m2w and w2m objects
1875 wxMutex m_iconvMutex;
1876 #endif
1877
1878 private:
1879 // the name (for iconv_open()) of a wide char charset -- if none is
1880 // available on this machine, it will remain NULL
1881 static wxString ms_wcCharsetName;
1882
1883 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1884 // different endian-ness than the native one
1885 static bool ms_wcNeedsSwap;
1886
1887
1888 // name of the encoding handled by this conversion
1889 wxString m_name;
1890
1891 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1892 // initially
1893 size_t m_minMBCharWidth;
1894 };
1895
1896 // make the constructor available for unit testing
1897 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1898 {
1899 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1900 if ( !result->IsOk() )
1901 {
1902 delete result;
1903 return 0;
1904 }
1905
1906 return result;
1907 }
1908
1909 wxString wxMBConv_iconv::ms_wcCharsetName;
1910 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1911
1912 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1913 : m_name(name)
1914 {
1915 m_minMBCharWidth = 0;
1916
1917 // check for charset that represents wchar_t:
1918 if ( ms_wcCharsetName.empty() )
1919 {
1920 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1921
1922 #if wxUSE_FONTMAP
1923 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1924 #else // !wxUSE_FONTMAP
1925 static const wxChar *names_static[] =
1926 {
1927 #if SIZEOF_WCHAR_T == 4
1928 _T("UCS-4"),
1929 #elif SIZEOF_WCHAR_T = 2
1930 _T("UCS-2"),
1931 #endif
1932 NULL
1933 };
1934 const wxChar **names = names_static;
1935 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1936
1937 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1938 {
1939 const wxString nameCS(*names);
1940
1941 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1942 wxString nameXE(nameCS);
1943
1944 #ifdef WORDS_BIGENDIAN
1945 nameXE += _T("BE");
1946 #else // little endian
1947 nameXE += _T("LE");
1948 #endif
1949
1950 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1951 nameXE.c_str());
1952
1953 m2w = iconv_open(nameXE.ToAscii(), name);
1954 if ( m2w == ICONV_T_INVALID )
1955 {
1956 // try charset w/o bytesex info (e.g. "UCS4")
1957 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1958 nameCS.c_str());
1959 m2w = iconv_open(nameCS.ToAscii(), name);
1960
1961 // and check for bytesex ourselves:
1962 if ( m2w != ICONV_T_INVALID )
1963 {
1964 char buf[2], *bufPtr;
1965 wchar_t wbuf[2], *wbufPtr;
1966 size_t insz, outsz;
1967 size_t res;
1968
1969 buf[0] = 'A';
1970 buf[1] = 0;
1971 wbuf[0] = 0;
1972 insz = 2;
1973 outsz = SIZEOF_WCHAR_T * 2;
1974 wbufPtr = wbuf;
1975 bufPtr = buf;
1976
1977 res = iconv(
1978 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1979 (char**)&wbufPtr, &outsz);
1980
1981 if (ICONV_FAILED(res, insz))
1982 {
1983 wxLogLastError(wxT("iconv"));
1984 wxLogError(_("Conversion to charset '%s' doesn't work."),
1985 nameCS.c_str());
1986 }
1987 else // ok, can convert to this encoding, remember it
1988 {
1989 ms_wcCharsetName = nameCS;
1990 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1991 }
1992 }
1993 }
1994 else // use charset not requiring byte swapping
1995 {
1996 ms_wcCharsetName = nameXE;
1997 }
1998 }
1999
2000 wxLogTrace(TRACE_STRCONV,
2001 wxT("iconv wchar_t charset is \"%s\"%s"),
2002 ms_wcCharsetName.empty() ? wxString("<none>")
2003 : ms_wcCharsetName,
2004 ms_wcNeedsSwap ? _T(" (needs swap)")
2005 : _T(""));
2006 }
2007 else // we already have ms_wcCharsetName
2008 {
2009 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2010 }
2011
2012 if ( ms_wcCharsetName.empty() )
2013 {
2014 w2m = ICONV_T_INVALID;
2015 }
2016 else
2017 {
2018 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2019 if ( w2m == ICONV_T_INVALID )
2020 {
2021 wxLogTrace(TRACE_STRCONV,
2022 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2023 ms_wcCharsetName.c_str(), name);
2024 }
2025 }
2026 }
2027
2028 wxMBConv_iconv::~wxMBConv_iconv()
2029 {
2030 if ( m2w != ICONV_T_INVALID )
2031 iconv_close(m2w);
2032 if ( w2m != ICONV_T_INVALID )
2033 iconv_close(w2m);
2034 }
2035
2036 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2037 {
2038 // find the string length: notice that must be done differently for
2039 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2040 size_t inbuf;
2041 const size_t nulLen = GetMBNulLen();
2042 switch ( nulLen )
2043 {
2044 default:
2045 return wxCONV_FAILED;
2046
2047 case 1:
2048 inbuf = strlen(psz); // arguably more optimized than our version
2049 break;
2050
2051 case 2:
2052 case 4:
2053 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2054 // they also have to start at character boundary and not span two
2055 // adjacent characters
2056 const char *p;
2057 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2058 ;
2059 inbuf = p - psz;
2060 break;
2061 }
2062
2063 #if wxUSE_THREADS
2064 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2065 // Unfortunately there are a couple of global wxCSConv objects such as
2066 // wxConvLocal that are used all over wx code, so we have to make sure
2067 // the handle is used by at most one thread at the time. Otherwise
2068 // only a few wx classes would be safe to use from non-main threads
2069 // as MB<->WC conversion would fail "randomly".
2070 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2071 #endif // wxUSE_THREADS
2072
2073 size_t outbuf = n * SIZEOF_WCHAR_T;
2074 size_t res, cres;
2075 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2076 wchar_t *bufPtr = buf;
2077 const char *pszPtr = psz;
2078
2079 if (buf)
2080 {
2081 // have destination buffer, convert there
2082 cres = iconv(m2w,
2083 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2084 (char**)&bufPtr, &outbuf);
2085 res = n - (outbuf / SIZEOF_WCHAR_T);
2086
2087 if (ms_wcNeedsSwap)
2088 {
2089 // convert to native endianness
2090 for ( unsigned i = 0; i < res; i++ )
2091 buf[n] = WC_BSWAP(buf[i]);
2092 }
2093
2094 // NUL-terminate the string if there is any space left
2095 if (res < n)
2096 buf[res] = 0;
2097 }
2098 else
2099 {
2100 // no destination buffer... convert using temp buffer
2101 // to calculate destination buffer requirement
2102 wchar_t tbuf[8];
2103 res = 0;
2104
2105 do
2106 {
2107 bufPtr = tbuf;
2108 outbuf = 8 * SIZEOF_WCHAR_T;
2109
2110 cres = iconv(m2w,
2111 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2112 (char**)&bufPtr, &outbuf );
2113
2114 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2115 }
2116 while ((cres == (size_t)-1) && (errno == E2BIG));
2117 }
2118
2119 if (ICONV_FAILED(cres, inbuf))
2120 {
2121 //VS: it is ok if iconv fails, hence trace only
2122 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2123 return wxCONV_FAILED;
2124 }
2125
2126 return res;
2127 }
2128
2129 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2130 {
2131 #if wxUSE_THREADS
2132 // NB: explained in MB2WC
2133 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2134 #endif
2135
2136 size_t inlen = wxWcslen(psz);
2137 size_t inbuf = inlen * SIZEOF_WCHAR_T;
2138 size_t outbuf = n;
2139 size_t res, cres;
2140
2141 wchar_t *tmpbuf = 0;
2142
2143 if (ms_wcNeedsSwap)
2144 {
2145 // need to copy to temp buffer to switch endianness
2146 // (doing WC_BSWAP twice on the original buffer won't help, as it
2147 // could be in read-only memory, or be accessed in some other thread)
2148 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2149 for ( size_t i = 0; i < inlen; i++ )
2150 tmpbuf[n] = WC_BSWAP(psz[i]);
2151
2152 tmpbuf[inlen] = L'\0';
2153 psz = tmpbuf;
2154 }
2155
2156 if (buf)
2157 {
2158 // have destination buffer, convert there
2159 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2160
2161 res = n - outbuf;
2162
2163 // NB: iconv was given only wcslen(psz) characters on input, and so
2164 // it couldn't convert the trailing zero. Let's do it ourselves
2165 // if there's some room left for it in the output buffer.
2166 if (res < n)
2167 buf[0] = 0;
2168 }
2169 else
2170 {
2171 // no destination buffer: convert using temp buffer
2172 // to calculate destination buffer requirement
2173 char tbuf[16];
2174 res = 0;
2175 do
2176 {
2177 buf = tbuf;
2178 outbuf = 16;
2179
2180 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2181
2182 res += 16 - outbuf;
2183 }
2184 while ((cres == (size_t)-1) && (errno == E2BIG));
2185 }
2186
2187 if (ms_wcNeedsSwap)
2188 {
2189 free(tmpbuf);
2190 }
2191
2192 if (ICONV_FAILED(cres, inbuf))
2193 {
2194 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2195 return wxCONV_FAILED;
2196 }
2197
2198 return res;
2199 }
2200
2201 size_t wxMBConv_iconv::GetMBNulLen() const
2202 {
2203 if ( m_minMBCharWidth == 0 )
2204 {
2205 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2206
2207 #if wxUSE_THREADS
2208 // NB: explained in MB2WC
2209 wxMutexLocker lock(self->m_iconvMutex);
2210 #endif
2211
2212 const wchar_t *wnul = L"";
2213 char buf[8]; // should be enough for NUL in any encoding
2214 size_t inLen = sizeof(wchar_t),
2215 outLen = WXSIZEOF(buf);
2216 char *inBuff = (char *)wnul;
2217 char *outBuff = buf;
2218 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2219 {
2220 self->m_minMBCharWidth = (size_t)-1;
2221 }
2222 else // ok
2223 {
2224 self->m_minMBCharWidth = outBuff - buf;
2225 }
2226 }
2227
2228 return m_minMBCharWidth;
2229 }
2230
2231 #if wxUSE_UNICODE_UTF8
2232 bool wxMBConv_iconv::IsUTF8() const
2233 {
2234 return wxStricmp(m_name, "UTF-8") == 0 ||
2235 wxStricmp(m_name, "UTF8") == 0;
2236 }
2237 #endif
2238
2239 #endif // HAVE_ICONV
2240
2241
2242 // ============================================================================
2243 // Win32 conversion classes
2244 // ============================================================================
2245
2246 #ifdef wxHAVE_WIN32_MB2WC
2247
2248 // from utils.cpp
2249 #if wxUSE_FONTMAP
2250 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2251 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2252 #endif
2253
2254 class wxMBConv_win32 : public wxMBConv
2255 {
2256 public:
2257 wxMBConv_win32()
2258 {
2259 m_CodePage = CP_ACP;
2260 m_minMBCharWidth = 0;
2261 }
2262
2263 wxMBConv_win32(const wxMBConv_win32& conv)
2264 : wxMBConv()
2265 {
2266 m_CodePage = conv.m_CodePage;
2267 m_minMBCharWidth = conv.m_minMBCharWidth;
2268 }
2269
2270 #if wxUSE_FONTMAP
2271 wxMBConv_win32(const char* name)
2272 {
2273 m_CodePage = wxCharsetToCodepage(name);
2274 m_minMBCharWidth = 0;
2275 }
2276
2277 wxMBConv_win32(wxFontEncoding encoding)
2278 {
2279 m_CodePage = wxEncodingToCodepage(encoding);
2280 m_minMBCharWidth = 0;
2281 }
2282 #endif // wxUSE_FONTMAP
2283
2284 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2285 {
2286 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2287 // the behaviour is not compatible with the Unix version (using iconv)
2288 // and break the library itself, e.g. wxTextInputStream::NextChar()
2289 // wouldn't work if reading an incomplete MB char didn't result in an
2290 // error
2291 //
2292 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2293 // Win XP or newer and it is not supported for UTF-[78] so we always
2294 // use our own conversions in this case. See
2295 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2296 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2297 if ( m_CodePage == CP_UTF8 )
2298 {
2299 return wxMBConvUTF8().MB2WC(buf, psz, n);
2300 }
2301
2302 if ( m_CodePage == CP_UTF7 )
2303 {
2304 return wxMBConvUTF7().MB2WC(buf, psz, n);
2305 }
2306
2307 int flags = 0;
2308 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2309 IsAtLeastWin2kSP4() )
2310 {
2311 flags = MB_ERR_INVALID_CHARS;
2312 }
2313
2314 const size_t len = ::MultiByteToWideChar
2315 (
2316 m_CodePage, // code page
2317 flags, // flags: fall on error
2318 psz, // input string
2319 -1, // its length (NUL-terminated)
2320 buf, // output string
2321 buf ? n : 0 // size of output buffer
2322 );
2323 if ( !len )
2324 {
2325 // function totally failed
2326 return wxCONV_FAILED;
2327 }
2328
2329 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2330 // check if we succeeded, by doing a double trip:
2331 if ( !flags && buf )
2332 {
2333 const size_t mbLen = strlen(psz);
2334 wxCharBuffer mbBuf(mbLen);
2335 if ( ::WideCharToMultiByte
2336 (
2337 m_CodePage,
2338 0,
2339 buf,
2340 -1,
2341 mbBuf.data(),
2342 mbLen + 1, // size in bytes, not length
2343 NULL,
2344 NULL
2345 ) == 0 ||
2346 strcmp(mbBuf, psz) != 0 )
2347 {
2348 // we didn't obtain the same thing we started from, hence
2349 // the conversion was lossy and we consider that it failed
2350 return wxCONV_FAILED;
2351 }
2352 }
2353
2354 // note that it returns count of written chars for buf != NULL and size
2355 // of the needed buffer for buf == NULL so in either case the length of
2356 // the string (which never includes the terminating NUL) is one less
2357 return len - 1;
2358 }
2359
2360 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2361 {
2362 /*
2363 we have a problem here: by default, WideCharToMultiByte() may
2364 replace characters unrepresentable in the target code page with bad
2365 quality approximations such as turning "1/2" symbol (U+00BD) into
2366 "1" for the code pages which don't have it and we, obviously, want
2367 to avoid this at any price
2368
2369 the trouble is that this function does it _silently_, i.e. it won't
2370 even tell us whether it did or not... Win98/2000 and higher provide
2371 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2372 we have to resort to a round trip, i.e. check that converting back
2373 results in the same string -- this is, of course, expensive but
2374 otherwise we simply can't be sure to not garble the data.
2375 */
2376
2377 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2378 // it doesn't work with CJK encodings (which we test for rather roughly
2379 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2380 // supporting it
2381 BOOL usedDef wxDUMMY_INITIALIZE(false);
2382 BOOL *pUsedDef;
2383 int flags;
2384 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2385 {
2386 // it's our lucky day
2387 flags = WC_NO_BEST_FIT_CHARS;
2388 pUsedDef = &usedDef;
2389 }
2390 else // old system or unsupported encoding
2391 {
2392 flags = 0;
2393 pUsedDef = NULL;
2394 }
2395
2396 const size_t len = ::WideCharToMultiByte
2397 (
2398 m_CodePage, // code page
2399 flags, // either none or no best fit
2400 pwz, // input string
2401 -1, // it is (wide) NUL-terminated
2402 buf, // output buffer
2403 buf ? n : 0, // and its size
2404 NULL, // default "replacement" char
2405 pUsedDef // [out] was it used?
2406 );
2407
2408 if ( !len )
2409 {
2410 // function totally failed
2411 return wxCONV_FAILED;
2412 }
2413
2414 // if we were really converting, check if we succeeded
2415 if ( buf )
2416 {
2417 if ( flags )
2418 {
2419 // check if the conversion failed, i.e. if any replacements
2420 // were done
2421 if ( usedDef )
2422 return wxCONV_FAILED;
2423 }
2424 else // we must resort to double tripping...
2425 {
2426 wxWCharBuffer wcBuf(n);
2427 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2428 wcscmp(wcBuf, pwz) != 0 )
2429 {
2430 // we didn't obtain the same thing we started from, hence
2431 // the conversion was lossy and we consider that it failed
2432 return wxCONV_FAILED;
2433 }
2434 }
2435 }
2436
2437 // see the comment above for the reason of "len - 1"
2438 return len - 1;
2439 }
2440
2441 virtual size_t GetMBNulLen() const
2442 {
2443 if ( m_minMBCharWidth == 0 )
2444 {
2445 int len = ::WideCharToMultiByte
2446 (
2447 m_CodePage, // code page
2448 0, // no flags
2449 L"", // input string
2450 1, // translate just the NUL
2451 NULL, // output buffer
2452 0, // and its size
2453 NULL, // no replacement char
2454 NULL // [out] don't care if it was used
2455 );
2456
2457 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2458 switch ( len )
2459 {
2460 default:
2461 wxLogDebug(_T("Unexpected NUL length %d"), len);
2462 self->m_minMBCharWidth = (size_t)-1;
2463 break;
2464
2465 case 0:
2466 self->m_minMBCharWidth = (size_t)-1;
2467 break;
2468
2469 case 1:
2470 case 2:
2471 case 4:
2472 self->m_minMBCharWidth = len;
2473 break;
2474 }
2475 }
2476
2477 return m_minMBCharWidth;
2478 }
2479
2480 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2481
2482 bool IsOk() const { return m_CodePage != -1; }
2483
2484 private:
2485 static bool CanUseNoBestFit()
2486 {
2487 static int s_isWin98Or2k = -1;
2488
2489 if ( s_isWin98Or2k == -1 )
2490 {
2491 int verMaj, verMin;
2492 switch ( wxGetOsVersion(&verMaj, &verMin) )
2493 {
2494 case wxOS_WINDOWS_9X:
2495 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2496 break;
2497
2498 case wxOS_WINDOWS_NT:
2499 s_isWin98Or2k = verMaj >= 5;
2500 break;
2501
2502 default:
2503 // unknown: be conservative by default
2504 s_isWin98Or2k = 0;
2505 break;
2506 }
2507
2508 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2509 }
2510
2511 return s_isWin98Or2k == 1;
2512 }
2513
2514 static bool IsAtLeastWin2kSP4()
2515 {
2516 #ifdef __WXWINCE__
2517 return false;
2518 #else
2519 static int s_isAtLeastWin2kSP4 = -1;
2520
2521 if ( s_isAtLeastWin2kSP4 == -1 )
2522 {
2523 OSVERSIONINFOEX ver;
2524
2525 memset(&ver, 0, sizeof(ver));
2526 ver.dwOSVersionInfoSize = sizeof(ver);
2527 GetVersionEx((OSVERSIONINFO*)&ver);
2528
2529 s_isAtLeastWin2kSP4 =
2530 ((ver.dwMajorVersion > 5) || // Vista+
2531 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2532 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2533 ver.wServicePackMajor >= 4)) // 2000 SP4+
2534 ? 1 : 0;
2535 }
2536
2537 return s_isAtLeastWin2kSP4 == 1;
2538 #endif
2539 }
2540
2541
// the code page we're working with; IsOk() treats the value -1 as meaning
// that this converter is not usable
long m_CodePage;

// cached result of GetMBNulLen(), set to 0 initially meaning
// "unknown"; GetMBNulLen() stores (size_t)-1 here if the width of NUL
// could not be determined
size_t m_minMBCharWidth;
2548 };
2549
2550 #endif // wxHAVE_WIN32_MB2WC
2551
2552
2553 // ============================================================================
2554 // wxEncodingConverter based conversion classes
2555 // ============================================================================
2556
2557 #if wxUSE_FONTMAP
2558
2559 class wxMBConv_wxwin : public wxMBConv
2560 {
2561 private:
2562 void Init()
2563 {
2564 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2565 // The wxMBConv_cf class does a better job.
2566 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2567 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2568 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2569 }
2570
2571 public:
2572 // temporarily just use wxEncodingConverter stuff,
2573 // so that it works while a better implementation is built
2574 wxMBConv_wxwin(const char* name)
2575 {
2576 if (name)
2577 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2578 else
2579 m_enc = wxFONTENCODING_SYSTEM;
2580
2581 Init();
2582 }
2583
2584 wxMBConv_wxwin(wxFontEncoding enc)
2585 {
2586 m_enc = enc;
2587
2588 Init();
2589 }
2590
2591 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2592 {
2593 size_t inbuf = strlen(psz);
2594 if (buf)
2595 {
2596 if (!m2w.Convert(psz, buf))
2597 return wxCONV_FAILED;
2598 }
2599 return inbuf;
2600 }
2601
2602 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2603 {
2604 const size_t inbuf = wxWcslen(psz);
2605 if (buf)
2606 {
2607 if (!w2m.Convert(psz, buf))
2608 return wxCONV_FAILED;
2609 }
2610
2611 return inbuf;
2612 }
2613
2614 virtual size_t GetMBNulLen() const
2615 {
2616 switch ( m_enc )
2617 {
2618 case wxFONTENCODING_UTF16BE:
2619 case wxFONTENCODING_UTF16LE:
2620 return 2;
2621
2622 case wxFONTENCODING_UTF32BE:
2623 case wxFONTENCODING_UTF32LE:
2624 return 4;
2625
2626 default:
2627 return 1;
2628 }
2629 }
2630
2631 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2632
2633 bool IsOk() const { return m_ok; }
2634
2635 public:
2636 wxFontEncoding m_enc;
2637 wxEncodingConverter m2w, w2m;
2638
2639 private:
2640 // were we initialized successfully?
2641 bool m_ok;
2642
2643 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2644 };
2645
2646 // make the constructors available for unit testing
2647 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2648 {
2649 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2650 if ( !result->IsOk() )
2651 {
2652 delete result;
2653 return 0;
2654 }
2655
2656 return result;
2657 }
2658
2659 #endif // wxUSE_FONTMAP
2660
2661 // ============================================================================
2662 // wxCSConv implementation
2663 // ============================================================================
2664
2665 void wxCSConv::Init()
2666 {
2667 m_name = NULL;
2668 m_convReal = NULL;
2669 m_deferred = true;
2670 }
2671
2672 wxCSConv::wxCSConv(const wxString& charset)
2673 {
2674 Init();
2675
2676 if ( !charset.empty() )
2677 {
2678 SetName(charset.ToAscii());
2679 }
2680
2681 #if wxUSE_FONTMAP
2682 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2683 #else
2684 m_encoding = wxFONTENCODING_SYSTEM;
2685 #endif
2686 }
2687
2688 wxCSConv::wxCSConv(wxFontEncoding encoding)
2689 {
2690 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2691 {
2692 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2693
2694 encoding = wxFONTENCODING_SYSTEM;
2695 }
2696
2697 Init();
2698
2699 m_encoding = encoding;
2700 }
2701
2702 wxCSConv::~wxCSConv()
2703 {
2704 Clear();
2705 }
2706
2707 wxCSConv::wxCSConv(const wxCSConv& conv)
2708 : wxMBConv()
2709 {
2710 Init();
2711
2712 SetName(conv.m_name);
2713 m_encoding = conv.m_encoding;
2714 }
2715
2716 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2717 {
2718 Clear();
2719
2720 SetName(conv.m_name);
2721 m_encoding = conv.m_encoding;
2722
2723 return *this;
2724 }
2725
2726 void wxCSConv::Clear()
2727 {
2728 free(m_name);
2729 delete m_convReal;
2730
2731 m_name = NULL;
2732 m_convReal = NULL;
2733 }
2734
2735 void wxCSConv::SetName(const char *charset)
2736 {
2737 if (charset)
2738 {
2739 m_name = wxStrdup(charset);
2740 m_deferred = true;
2741 }
2742 }
2743
2744 #if wxUSE_FONTMAP
2745
// hash map type associating a charset name string with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate() when creating iconv-based converters:
// for each encoding it stores the name which iconv accepted or an empty
// string if none of the known names worked
static wxEncodingNameCache gs_nameCache;
2750 #endif
2751
// Create the object really doing the conversion for the charset name and/or
// encoding of this wxCSConv.
//
// Returns a new heap-allocated converter or NULL. NULL is returned both when
// no converter is needed (ISO8859-1 and wxFONTENCODING_DEFAULT are handled
// without one) and when none of the methods tried below worked, in which
// case an error is also logged.
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
    wxLogTrace(TRACE_STRCONV,
               wxT("creating conversion for %s"),
               (m_name ? m_name
                       : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
#endif // wxUSE_FONTMAP

    // check for the special case of ASCII or ISO8859-1 charset: as we have
    // special knowledge of it anyhow, we don't need to create a special
    // conversion object
    if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
            m_encoding == wxFONTENCODING_DEFAULT )
    {
        // don't convert at all
        return NULL;
    }

    // we trust OS to do conversion better than we can so try external
    // conversion methods first
    //
    // the full order is:
    //      1. OS conversion (iconv() under Unix or Win32 API)
    //      2. hard coded conversions for UTF
    //      3. wxEncodingConverter as fall back

    // step (1)
#ifdef HAVE_ICONV
#if !wxUSE_FONTMAP
    if ( m_name )
#endif // !wxUSE_FONTMAP
    {
#if wxUSE_FONTMAP
        wxFontEncoding encoding(m_encoding);
#endif

        // first try the name exactly as it was given to us
        if ( m_name )
        {
            wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
            if ( conv->IsOk() )
                return conv;

            delete conv;

#if wxUSE_FONTMAP
            // iconv didn't accept the name, map it to an encoding so that
            // the other names of this encoding can be tried below
            encoding =
                wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
#endif // wxUSE_FONTMAP
        }
#if wxUSE_FONTMAP
        {
            // check if we already know which name works for this encoding;
            // an empty cached name means that none of them did
            const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
            if ( it != gs_nameCache.end() )
            {
                if ( it->second.empty() )
                    return NULL;

                wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
                if ( conv->IsOk() )
                    return conv;

                delete conv;
            }

            const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
            // CS : in case this does not return valid names (e.g. for
            // MacRoman) the encoding used to get a 'failure' entry in the
            // cache all the same, although it just has to be created using
            // a different method, so only store failed iconv creation
            // attempts (or perhaps we shouldn't do this at all?)
            if ( names[0] != NULL )
            {
                for ( ; *names; ++names )
                {
                    // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
                    //             will need changes that will obsolete this
                    wxString name(*names);
                    wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
                    if ( conv->IsOk() )
                    {
                        // remember the name which worked for the next time
                        gs_nameCache[encoding] = *names;
                        return conv;
                    }

                    delete conv;
                }

                gs_nameCache[encoding] = _T(""); // cache the failure
            }
        }
#endif // wxUSE_FONTMAP
    }
#endif // HAVE_ICONV

#ifdef wxHAVE_WIN32_MB2WC
    {
#if wxUSE_FONTMAP
        wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
                                      : new wxMBConv_win32(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
#else
        // without the font mapper no Win32 converter is created and the
        // remaining steps below are not tried
        return NULL;
#endif
    }
#endif // wxHAVE_WIN32_MB2WC

#ifdef __DARWIN__
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
                                       : new wxMBConv_cf(m_encoding);
#else
            wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
#endif

            if ( conv->IsOk() )
                return conv;

            delete conv;
        }
    }
#endif // __DARWIN__

    // step (2)
    wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
    if ( enc == wxFONTENCODING_SYSTEM && m_name )
    {
        // use "false" to suppress interactive dialogs -- we can be called from
        // anywhere and popping up a dialog from here is the last thing we want to
        // do
        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
    }
#endif // wxUSE_FONTMAP

    switch ( enc )
    {
        case wxFONTENCODING_UTF7:
             return new wxMBConvUTF7;

        case wxFONTENCODING_UTF8:
             return new wxMBConvUTF8;

        case wxFONTENCODING_UTF16BE:
             return new wxMBConvUTF16BE;

        case wxFONTENCODING_UTF16LE:
             return new wxMBConvUTF16LE;

        case wxFONTENCODING_UTF32BE:
             return new wxMBConvUTF32BE;

        case wxFONTENCODING_UTF32LE:
             return new wxMBConvUTF32LE;

        default:
             // nothing to do but put here to suppress gcc warnings
             break;
    }

    // step (3)
#if wxUSE_FONTMAP
    {
        wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
                                      : new wxMBConv_wxwin(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
    }
#endif // wxUSE_FONTMAP

    // NB: This is a hack to prevent deadlock. What could otherwise happen
    //     in Unicode build: wxConvLocal creation ends up being here
    //     because of some failure and logs the error. But wxLog will try to
    //     attach a timestamp, for which it will need wxConvLocal (to convert
    //     time to char* and then wchar_t*), but that fails, tries to log the
    //     error, but wxLog has an (already locked) critical section that
    //     guards the static buffer.
    static bool alreadyLoggingError = false;
    if (!alreadyLoggingError)
    {
        alreadyLoggingError = true;
        wxLogError(_("Cannot convert from the charset '%s'!"),
                   m_name ? m_name
                      :
#if wxUSE_FONTMAP
                         (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
#else // !wxUSE_FONTMAP
                         (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
              );

        alreadyLoggingError = false;
    }

    // all methods failed
    return NULL;
}
2958
2959 void wxCSConv::CreateConvIfNeeded() const
2960 {
2961 if ( m_deferred )
2962 {
2963 wxCSConv *self = (wxCSConv *)this; // const_cast
2964
2965 // if we don't have neither the name nor the encoding, use the default
2966 // encoding for this system
2967 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2968 {
2969 #if wxUSE_INTL
2970 self->m_encoding = wxLocale::GetSystemEncoding();
2971 #else
2972 // fallback to some reasonable default:
2973 self->m_encoding = wxFONTENCODING_ISO8859_1;
2974 #endif // wxUSE_INTL
2975 }
2976
2977 self->m_convReal = DoCreate();
2978 self->m_deferred = false;
2979 }
2980 }
2981
2982 bool wxCSConv::IsOk() const
2983 {
2984 CreateConvIfNeeded();
2985
2986 // special case: no convReal created for wxFONTENCODING_ISO8859_1
2987 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2988 return true; // always ok as we do it ourselves
2989
2990 // m_convReal->IsOk() is called at its own creation, so we know it must
2991 // be ok if m_convReal is non-NULL
2992 return m_convReal != NULL;
2993 }
2994
2995 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
2996 const char *src, size_t srcLen) const
2997 {
2998 CreateConvIfNeeded();
2999
3000 if (m_convReal)
3001 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3002
3003 // latin-1 (direct)
3004 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3005 }
3006
3007 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3008 const wchar_t *src, size_t srcLen) const
3009 {
3010 CreateConvIfNeeded();
3011
3012 if (m_convReal)
3013 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3014
3015 // latin-1 (direct)
3016 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3017 }
3018
3019 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3020 {
3021 CreateConvIfNeeded();
3022
3023 if (m_convReal)
3024 return m_convReal->MB2WC(buf, psz, n);
3025
3026 // latin-1 (direct)
3027 size_t len = strlen(psz);
3028
3029 if (buf)
3030 {
3031 for (size_t c = 0; c <= len; c++)
3032 buf[c] = (unsigned char)(psz[c]);
3033 }
3034
3035 return len;
3036 }
3037
3038 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3039 {
3040 CreateConvIfNeeded();
3041
3042 if (m_convReal)
3043 return m_convReal->WC2MB(buf, psz, n);
3044
3045 // latin-1 (direct)
3046 const size_t len = wxWcslen(psz);
3047 if (buf)
3048 {
3049 for (size_t c = 0; c <= len; c++)
3050 {
3051 if (psz[c] > 0xFF)
3052 return wxCONV_FAILED;
3053
3054 buf[c] = (char)psz[c];
3055 }
3056 }
3057 else
3058 {
3059 for (size_t c = 0; c <= len; c++)
3060 {
3061 if (psz[c] > 0xFF)
3062 return wxCONV_FAILED;
3063 }
3064 }
3065
3066 return len;
3067 }
3068
3069 size_t wxCSConv::GetMBNulLen() const
3070 {
3071 CreateConvIfNeeded();
3072
3073 if ( m_convReal )
3074 {
3075 return m_convReal->GetMBNulLen();
3076 }
3077
3078 // otherwise, we are ISO-8859-1
3079 return 1;
3080 }
3081
3082 #if wxUSE_UNICODE_UTF8
3083 bool wxCSConv::IsUTF8() const
3084 {
3085 CreateConvIfNeeded();
3086
3087 if ( m_convReal )
3088 {
3089 return m_convReal->IsUTF8();
3090 }
3091
3092 // otherwise, we are ISO-8859-1
3093 return false;
3094 }
3095 #endif
3096
3097
3098 #if wxUSE_UNICODE
3099
3100 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3101 {
3102 if ( !s )
3103 return wxWCharBuffer();
3104
3105 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3106 if ( !wbuf )
3107 wbuf = wxMBConvUTF8().cMB2WX(s);
3108 if ( !wbuf )
3109 wbuf = wxConvISO8859_1.cMB2WX(s);
3110
3111 return wbuf;
3112 }
3113
3114 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3115 {
3116 if ( !ws )
3117 return wxCharBuffer();
3118
3119 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3120 if ( !buf )
3121 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3122
3123 return buf;
3124 }
3125
3126 #endif // wxUSE_UNICODE
3127
3128 // ----------------------------------------------------------------------------
3129 // globals
3130 // ----------------------------------------------------------------------------
3131
// NB: The reason why we create the converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object), in other
//     words, possibly _before_ the global converter object would be
//     initialized.
3138
3139 #undef wxConvLibc
3140 #undef wxConvUTF8
3141 #undef wxConvUTF7
3142 #undef wxConvLocal
3143 #undef wxConvISO8859_1
3144
// Define a global converter called "name":
//  - klass is the type of the pointers defined and returned
//  - impl_klass is the type of the object actually created
//  - ctor_args are the constructor arguments, parentheses included
//
// This defines the name##Ptr pointer (initially NULL), the accessor function
// wxGet_##name##Ptr() returning the address of a function-local static
// object, and a file-static variable initialized by calling this accessor.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the object created is of the same
// type as the pointers to it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3159
// wxConvLibc is a default-constructed wxMBConv_win32 under Windows and a
// wxMBConvLibc object elsewhere
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// the UTF-8 and UTF-7 converters
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);

// the wxCSConv objects for the system encoding and for ISO8859-1
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// these pointers initially refer to the libc and the local converters
// respectively
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// the converter used for the file names: the UTF-8 object above under
// Darwin and the same object as wxConvLibc elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3187
3188 #else // !wxUSE_WCHAR_T
3189
3190 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3191 // stand-ins in absence of wchar_t
// without wchar_t support all the global converters are plain wxMBConv
// objects
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3196
3197 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T