optimized wxMBConvStrictUTF8::ToWChar() for ASCII characters
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef __SALFORDC__
48 #include <clib.h>
49 #endif
50
51 #ifdef HAVE_ICONV
52 #include <iconv.h>
53 #include "wx/thread.h"
54 #endif
55
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
58
59 #ifdef __DARWIN__
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
62
63
// name used to tag this file's trace output (its uses are not visible in this
// part of the file)
#define TRACE_STRCONV _T("strconv")

// WC_UTF16 is defined if and only if wchar_t is 2 bytes wide, i.e. when wide
// strings are stored as UTF-16; otherwise wchar_t is assumed to be 4 bytes
#if SIZEOF_WCHAR_T == 2
    #define WC_UTF16
#endif
71
72
73 // ============================================================================
74 // implementation
75 // ============================================================================
76
// helper of the conversion functions: returns true if at least one of the n
// bytes starting at p is not NUL, false if all of them are NUL (or n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
85
86 // ----------------------------------------------------------------------------
87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
88 // ----------------------------------------------------------------------------
89
90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
91 {
92 if (input <= 0xffff)
93 {
94 if (output)
95 *output = (wxUint16) input;
96
97 return 1;
98 }
99 else if (input >= 0x110000)
100 {
101 return wxCONV_FAILED;
102 }
103 else
104 {
105 if (output)
106 {
107 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
108 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
109 }
110
111 return 2;
112 }
113 }
114
115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
116 {
117 if ((*input < 0xd800) || (*input > 0xdfff))
118 {
119 output = *input;
120 return 1;
121 }
122 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
123 {
124 output = *input;
125 return wxCONV_FAILED;
126 }
127 else
128 {
129 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
130 return 2;
131 }
132 }
133
134 #ifdef WC_UTF16
135 typedef wchar_t wxDecodeSurrogate_t;
136 #else // !WC_UTF16
137 typedef wxUint16 wxDecodeSurrogate_t;
138 #endif // WC_UTF16/!WC_UTF16
139
140 // returns the next UTF-32 character from the wchar_t buffer and advances the
141 // pointer to the character after this one
142 //
143 // if an invalid character is found, *pSrc is set to NULL, the caller must
144 // check for this
145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
146 {
147 wxUint32 out;
148 const size_t
149 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
150 if ( n == wxCONV_FAILED )
151 *pSrc = NULL;
152 else
153 *pSrc += n;
154
155 return out;
156 }
157
158 // ----------------------------------------------------------------------------
159 // wxMBConv
160 // ----------------------------------------------------------------------------
161
162 size_t
163 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
164 const char *src, size_t srcLen) const
165 {
166 // although new conversion classes are supposed to implement this function
167 // directly, the existins ones only implement the old MB2WC() and so, to
168 // avoid to have to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
172
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
175
176 // the number of NULs terminating this string
177 size_t nulLen = 0; // not really needed, but just to avoid warnings
178
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
185 if ( srcLen != wxNO_LEN )
186 {
187 // we need to know how to find the end of this string
188 nulLen = GetMBNulLen();
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
191
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
194 {
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
197 char * const p = bufTmp.data();
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
200 *s = '\0';
201
202 src = bufTmp;
203 }
204
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
211
212 for ( ;; )
213 {
214 // try to convert the current chunk
215 size_t lenChunk = MB2WC(NULL, src, 0);
216 if ( lenChunk == wxCONV_FAILED )
217 return wxCONV_FAILED;
218
219 lenChunk++; // for the L'\0' at the end of this chunk
220
221 dstWritten += lenChunk;
222
223 if ( lenChunk == 1 )
224 {
225 // nothing left in the input string, conversion succeeded
226 break;
227 }
228
229 if ( dst )
230 {
231 if ( dstWritten > dstLen )
232 return wxCONV_FAILED;
233
234 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
235 return wxCONV_FAILED;
236
237 dst += lenChunk;
238 }
239
240 if ( !srcEnd )
241 {
242 // we convert just one chunk in this case as this is the entire
243 // string anyhow
244 break;
245 }
246
247 // advance the input pointer past the end of this chunk
248 while ( NotAllNULs(src, nulLen) )
249 {
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
254 src += nulLen;
255 }
256
257 src += nulLen; // skipping over its terminator as well
258
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by inEnd
262 if ( src >= srcEnd )
263 break;
264 }
265
266 return dstWritten;
267 }
268
269 size_t
270 wxMBConv::FromWChar(char *dst, size_t dstLen,
271 const wchar_t *src, size_t srcLen) const
272 {
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten = 0;
275
276 // make a copy of the input string unless it is already properly
277 // NUL-terminated
278 //
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp;
282 if ( srcLen == wxNO_LEN )
283 {
284 srcLen = wxWcslen(src) + 1;
285 }
286 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
287 {
288 // make a copy in order to properly NUL-terminate the string
289 bufTmp = wxWCharBuffer(srcLen);
290 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
291 src = bufTmp;
292 }
293
294 const size_t lenNul = GetMBNulLen();
295 for ( const wchar_t * const srcEnd = src + srcLen;
296 src < srcEnd;
297 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
298 {
299 // try to convert the current chunk
300 size_t lenChunk = WC2MB(NULL, src, 0);
301
302 if ( lenChunk == wxCONV_FAILED )
303 return wxCONV_FAILED;
304
305 lenChunk += lenNul;
306 dstWritten += lenChunk;
307
308 if ( dst )
309 {
310 if ( dstWritten > dstLen )
311 return wxCONV_FAILED;
312
313 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dst += lenChunk;
317 }
318 }
319
320 return dstWritten;
321 }
322
323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
324 {
325 size_t rc = ToWChar(outBuff, outLen, inBuff);
326 if ( rc != wxCONV_FAILED )
327 {
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
330 rc--;
331 }
332
333 return rc;
334 }
335
336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
337 {
338 size_t rc = FromWChar(outBuff, outLen, inBuff);
339 if ( rc != wxCONV_FAILED )
340 {
341 rc -= GetMBNulLen();
342 }
343
344 return rc;
345 }
346
347 wxMBConv::~wxMBConv()
348 {
349 // nothing to do here (necessary for Darwin linking probably)
350 }
351
352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
353 {
354 if ( psz )
355 {
356 // calculate the length of the buffer needed first
357 const size_t nLen = ToWChar(NULL, 0, psz);
358 if ( nLen != wxCONV_FAILED )
359 {
360 // now do the actual conversion
361 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
362
363 // +1 for the trailing NULL
364 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
365 return buf;
366 }
367 }
368
369 return wxWCharBuffer();
370 }
371
372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
373 {
374 if ( pwz )
375 {
376 const size_t nLen = FromWChar(NULL, 0, pwz);
377 if ( nLen != wxCONV_FAILED )
378 {
379 wxCharBuffer buf(nLen - 1);
380 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
381 return buf;
382 }
383 }
384
385 return wxCharBuffer();
386 }
387
388 const wxWCharBuffer
389 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
390 {
391 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
392 if ( dstLen != wxCONV_FAILED )
393 {
394 wxWCharBuffer wbuf(dstLen - 1);
395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
396 {
397 if ( outLen )
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412 }
413
414 const wxCharBuffer
415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
416 {
417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
418 if ( dstLen != wxCONV_FAILED )
419 {
420 // special case of empty input: can't allocate 0 size buffer below as
421 // wxCharBuffer insists on NUL-terminating it
422 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
423 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
424 {
425 if ( outLen )
426 {
427 *outLen = dstLen;
428
429 const size_t nulLen = GetMBNulLen();
430 if ( dstLen >= nulLen &&
431 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
432 {
433 // in this case the output is NUL-terminated and we're not
434 // supposed to count NUL
435 *outLen -= nulLen;
436 }
437 }
438
439 return buf;
440 }
441 }
442
443 if ( outLen )
444 *outLen = 0;
445
446 return wxCharBuffer();
447 }
448
449 // ----------------------------------------------------------------------------
450 // wxMBConvLibc
451 // ----------------------------------------------------------------------------
452
453 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
454 {
455 return wxMB2WC(buf, psz, n);
456 }
457
458 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
459 {
460 return wxWC2MB(buf, psz, n);
461 }
462
463 // ----------------------------------------------------------------------------
464 // wxConvBrokenFileNames
465 // ----------------------------------------------------------------------------
466
467 #ifdef __UNIX__
468
469 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
470 {
471 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
472 wxStricmp(charset, _T("UTF8")) == 0 )
473 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
474 else
475 m_conv = new wxCSConv(charset);
476 }
477
478 #endif // __UNIX__
479
480 // ----------------------------------------------------------------------------
481 // UTF-7
482 // ----------------------------------------------------------------------------
483
484 // Implementation (C) 2004 Fredrik Roubert
485
486 //
487 // BASE64 decoding table
488 //
// maps a byte to the 6 bit value it stands for in BASE64 ('A'-'Z' are 0-25,
// 'a'-'z' 26-51, '0'-'9' 52-61, '+' is 62 and '/' 63) or to 0xff if the byte
// is not a BASE64 character
static const unsigned char utf7unb64[] =
{
    // 0x00..0x1f: not BASE64
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x20..0x2f: only '+' (0x2b) and '/' (0x2f)
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,

    // 0x30..0x3f: the digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x40..0x5f: the upper case letters
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x60..0x7f: the lower case letters
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,

    // 0x80..0xff: not BASE64
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
524
525 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
526 {
527 size_t len = 0;
528
529 while ( *psz && (!buf || (len < n)) )
530 {
531 unsigned char cc = *psz++;
532 if (cc != '+')
533 {
534 // plain ASCII char
535 if (buf)
536 *buf++ = cc;
537 len++;
538 }
539 else if (*psz == '-')
540 {
541 // encoded plus sign
542 if (buf)
543 *buf++ = cc;
544 len++;
545 psz++;
546 }
547 else // start of BASE64 encoded string
548 {
549 bool lsb, ok;
550 unsigned int d, l;
551 for ( ok = lsb = false, d = 0, l = 0;
552 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
553 psz++ )
554 {
555 d <<= 6;
556 d += cc;
557 for (l += 6; l >= 8; lsb = !lsb)
558 {
559 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
560 if (lsb)
561 {
562 if (buf)
563 *buf++ |= c;
564 len ++;
565 }
566 else
567 {
568 if (buf)
569 *buf = (wchar_t)(c << 8);
570 }
571
572 ok = true;
573 }
574 }
575
576 if ( !ok )
577 {
578 // in valid UTF7 we should have valid characters after '+'
579 return wxCONV_FAILED;
580 }
581
582 if (*psz == '-')
583 psz++;
584 }
585 }
586
587 if ( buf && (len < n) )
588 *buf = '\0';
589
590 return len;
591 }
592
593 //
594 // BASE64 encoding table
595 //
// the BASE64 alphabet: the element at index i is the character used to
// represent the 6 bit value i
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', //  0..15
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', // 16..31
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', // 32..47
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'  // 48..63
};
607
608 //
609 // UTF-7 encoding table
610 //
611 // 0 - Set D (directly encoded characters)
612 // 1 - Set O (optional direct characters)
613 // 2 - whitespace characters (optional)
614 // 3 - special characters
615 //
// class of each of the ASCII characters: 0 for the characters of set D, 1 for
// those of set O, 2 for whitespace and 3 for the special characters
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 0x70..0x7f
};
627
628 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
629 {
630 size_t len = 0;
631
632 while (*psz && ((!buf) || (len < n)))
633 {
634 wchar_t cc = *psz++;
635 if (cc < 0x80 && utf7encode[cc] < 1)
636 {
637 // plain ASCII char
638 if (buf)
639 *buf++ = (char)cc;
640
641 len++;
642 }
643 #ifndef WC_UTF16
644 else if (((wxUint32)cc) > 0xffff)
645 {
646 // no surrogate pair generation (yet?)
647 return wxCONV_FAILED;
648 }
649 #endif
650 else
651 {
652 if (buf)
653 *buf++ = '+';
654
655 len++;
656 if (cc != '+')
657 {
658 // BASE64 encode string
659 unsigned int lsb, d, l;
660 for (d = 0, l = 0; /*nothing*/; psz++)
661 {
662 for (lsb = 0; lsb < 2; lsb ++)
663 {
664 d <<= 8;
665 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
666
667 for (l += 8; l >= 6; )
668 {
669 l -= 6;
670 if (buf)
671 *buf++ = utf7enb64[(d >> l) % 64];
672 len++;
673 }
674 }
675
676 cc = *psz;
677 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
678 break;
679 }
680
681 if (l != 0)
682 {
683 if (buf)
684 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
685
686 len++;
687 }
688 }
689
690 if (buf)
691 *buf++ = '-';
692 len++;
693 }
694 }
695
696 if (buf && (len < n))
697 *buf = 0;
698
699 return len;
700 }
701
702 // ----------------------------------------------------------------------------
703 // UTF-8
704 // ----------------------------------------------------------------------------
705
706 static const wxUint32 utf8_max[]=
707 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
708
709 // boundaries of the private use area we use to (temporarily) remap invalid
710 // characters invalid in a UTF-8 encoded string
711 const wxUint32 wxUnicodePUA = 0x100000;
712 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
713
// the total length, in bytes, of the UTF-8 sequence starting with the given
// byte or 0 if no valid sequence can start with it
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: ASCII characters are encoded using a single byte
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte of a sequence
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0 and C1 are invalid too, C2..DF start the 2 byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF start the 3 byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..F4 start the 4 byte sequences while F5..FF are invalid: they would
    // start 5 or 6 byte sequences or sequences encoding code points above
    // U+10FFFF, which RFC 3629 doesn't allow
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
748
749 size_t
750 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
751 const char *src, size_t srcLen) const
752 {
753 wchar_t *out = dstLen ? dst : NULL;
754 size_t written = 0;
755
756 if ( srcLen == wxNO_LEN )
757 srcLen = strlen(src) + 1;
758
759 for ( const char *p = src; ; p++ )
760 {
761 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
762 {
763 // all done successfully, just add the trailing NULL if we are not
764 // using explicit length
765 if ( srcLen == wxNO_LEN )
766 {
767 if ( out )
768 {
769 if ( !dstLen )
770 break;
771
772 *out = L'\0';
773 }
774
775 written++;
776 }
777
778 return written;
779 }
780
781 if ( out && !dstLen-- )
782 break;
783
784 wxUint32 code;
785 unsigned char c = *p;
786
787 if ( c < 0x80 )
788 {
789 if ( srcLen == 0 ) // the test works for wxNO_LEN too
790 break;
791
792 if ( srcLen != wxNO_LEN )
793 srcLen--;
794
795 code = c;
796 }
797 else
798 {
799 unsigned len = tableUtf8Lengths[c];
800 if ( !len )
801 break;
802
803 if ( srcLen < len ) // the test works for wxNO_LEN too
804 break;
805
806 if ( srcLen != wxNO_LEN )
807 srcLen -= len;
808
809 // Char. number range | UTF-8 octet sequence
810 // (hexadecimal) | (binary)
811 // ----------------------+----------------------------------------
812 // 0000 0000 - 0000 007F | 0xxxxxxx
813 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
814 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
815 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
816 //
817 // Code point value is stored in bits marked with 'x',
818 // lowest-order bit of the value on the right side in the diagram
819 // above. (from RFC 3629)
820
821 // mask to extract lead byte's value ('x' bits above), by sequence
822 // length:
823 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
824
825 // mask and value of lead byte's most significant bits, by length:
826 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
827 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
828
829 len--; // it's more convenient to work with 0-based length here
830
831 // extract the lead byte's value bits:
832 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
833 break;
834
835 code = c & leadValueMask[len];
836
837 // all remaining bytes, if any, are handled in the same way
838 // regardless of sequence's length:
839 for ( ; len; --len )
840 {
841 c = *++p;
842 if ( (c & 0xC0) != 0x80 )
843 return wxCONV_FAILED;
844
845 code <<= 6;
846 code |= c & 0x3F;
847 }
848 }
849
850 #ifdef WC_UTF16
851 // cast is ok because wchar_t == wxUint16 if WC_UTF16
852 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
853 {
854 if ( out )
855 out++;
856 written++;
857 }
858 #else // !WC_UTF16
859 if ( out )
860 *out = code;
861 #endif // WC_UTF16/!WC_UTF16
862
863 if ( out )
864 out++;
865
866 written++;
867 }
868
869 return wxCONV_FAILED;
870 }
871
872 size_t
873 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
874 const wchar_t *src, size_t srcLen) const
875 {
876 char *out = dstLen ? dst : NULL;
877 size_t written = 0;
878
879 for ( const wchar_t *wp = src; ; wp++ )
880 {
881 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
882 {
883 // all done successfully, just add the trailing NULL if we are not
884 // using explicit length
885 if ( srcLen == wxNO_LEN )
886 {
887 if ( out )
888 {
889 if ( !dstLen )
890 break;
891
892 *out = '\0';
893 }
894
895 written++;
896 }
897
898 return written;
899 }
900
901
902 wxUint32 code;
903 #ifdef WC_UTF16
904 // cast is ok for WC_UTF16
905 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
906 {
907 // skip the next char too as we decoded a surrogate
908 wp++;
909 }
910 #else // wchar_t is UTF-32
911 code = *wp & 0x7fffffff;
912 #endif
913
914 unsigned len;
915 if ( code <= 0x7F )
916 {
917 len = 1;
918 if ( out )
919 {
920 if ( dstLen < len )
921 break;
922
923 out[0] = (char)code;
924 }
925 }
926 else if ( code <= 0x07FF )
927 {
928 len = 2;
929 if ( out )
930 {
931 if ( dstLen < len )
932 break;
933
934 // NB: this line takes 6 least significant bits, encodes them as
935 // 10xxxxxx and discards them so that the next byte can be encoded:
936 out[1] = 0x80 | (code & 0x3F); code >>= 6;
937 out[0] = 0xC0 | code;
938 }
939 }
940 else if ( code < 0xFFFF )
941 {
942 len = 3;
943 if ( out )
944 {
945 if ( dstLen < len )
946 break;
947
948 out[2] = 0x80 | (code & 0x3F); code >>= 6;
949 out[1] = 0x80 | (code & 0x3F); code >>= 6;
950 out[0] = 0xE0 | code;
951 }
952 }
953 else if ( code <= 0x10FFFF )
954 {
955 len = 4;
956 if ( out )
957 {
958 if ( dstLen < len )
959 break;
960
961 out[3] = 0x80 | (code & 0x3F); code >>= 6;
962 out[2] = 0x80 | (code & 0x3F); code >>= 6;
963 out[1] = 0x80 | (code & 0x3F); code >>= 6;
964 out[0] = 0xF0 | code;
965 }
966 }
967 else
968 {
969 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
970 break;
971 }
972
973 if ( out )
974 {
975 out += len;
976 dstLen -= len;
977 }
978
979 written += len;
980 }
981
982 // we only get here if an error occurs during decoding
983 return wxCONV_FAILED;
984 }
985
986 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
987 {
988 if ( m_options == MAP_INVALID_UTF8_NOT )
989 return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
990
991 size_t len = 0;
992
993 while (*psz && ((!buf) || (len < n)))
994 {
995 const char *opsz = psz;
996 bool invalid = false;
997 unsigned char cc = *psz++, fc = cc;
998 unsigned cnt;
999 for (cnt = 0; fc & 0x80; cnt++)
1000 fc <<= 1;
1001
1002 if (!cnt)
1003 {
1004 // plain ASCII char
1005 if (buf)
1006 *buf++ = cc;
1007 len++;
1008
1009 // escape the escape character for octal escapes
1010 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1011 && cc == '\\' && (!buf || len < n))
1012 {
1013 if (buf)
1014 *buf++ = cc;
1015 len++;
1016 }
1017 }
1018 else
1019 {
1020 cnt--;
1021 if (!cnt)
1022 {
1023 // invalid UTF-8 sequence
1024 invalid = true;
1025 }
1026 else
1027 {
1028 unsigned ocnt = cnt - 1;
1029 wxUint32 res = cc & (0x3f >> cnt);
1030 while (cnt--)
1031 {
1032 cc = *psz;
1033 if ((cc & 0xC0) != 0x80)
1034 {
1035 // invalid UTF-8 sequence
1036 invalid = true;
1037 break;
1038 }
1039
1040 psz++;
1041 res = (res << 6) | (cc & 0x3f);
1042 }
1043
1044 if (invalid || res <= utf8_max[ocnt])
1045 {
1046 // illegal UTF-8 encoding
1047 invalid = true;
1048 }
1049 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1050 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1051 {
1052 // if one of our PUA characters turns up externally
1053 // it must also be treated as an illegal sequence
1054 // (a bit like you have to escape an escape character)
1055 invalid = true;
1056 }
1057 else
1058 {
1059 #ifdef WC_UTF16
1060 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1061 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1062 if (pa == wxCONV_FAILED)
1063 {
1064 invalid = true;
1065 }
1066 else
1067 {
1068 if (buf)
1069 buf += pa;
1070 len += pa;
1071 }
1072 #else // !WC_UTF16
1073 if (buf)
1074 *buf++ = (wchar_t)res;
1075 len++;
1076 #endif // WC_UTF16/!WC_UTF16
1077 }
1078 }
1079
1080 if (invalid)
1081 {
1082 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1083 {
1084 while (opsz < psz && (!buf || len < n))
1085 {
1086 #ifdef WC_UTF16
1087 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1088 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1089 wxASSERT(pa != wxCONV_FAILED);
1090 if (buf)
1091 buf += pa;
1092 opsz++;
1093 len += pa;
1094 #else
1095 if (buf)
1096 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1097 opsz++;
1098 len++;
1099 #endif
1100 }
1101 }
1102 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1103 {
1104 while (opsz < psz && (!buf || len < n))
1105 {
1106 if ( buf && len + 3 < n )
1107 {
1108 unsigned char on = *opsz;
1109 *buf++ = L'\\';
1110 *buf++ = (wchar_t)( L'0' + on / 0100 );
1111 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1112 *buf++ = (wchar_t)( L'0' + on % 010 );
1113 }
1114
1115 opsz++;
1116 len += 4;
1117 }
1118 }
1119 else // MAP_INVALID_UTF8_NOT
1120 {
1121 return wxCONV_FAILED;
1122 }
1123 }
1124 }
1125 }
1126
1127 if (buf && (len < n))
1128 *buf = 0;
1129
1130 return len;
1131 }
1132
// returns true if the given character is one of the octal digits '0' to '7'
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1137
1138 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1139 {
1140 if ( m_options == MAP_INVALID_UTF8_NOT )
1141 return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
1142
1143 size_t len = 0;
1144
1145 while (*psz && ((!buf) || (len < n)))
1146 {
1147 wxUint32 cc;
1148
1149 #ifdef WC_UTF16
1150 // cast is ok for WC_UTF16
1151 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1152 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1153 #else
1154 cc = (*psz++) & 0x7fffffff;
1155 #endif
1156
1157 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1158 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1159 {
1160 if (buf)
1161 *buf++ = (char)(cc - wxUnicodePUA);
1162 len++;
1163 }
1164 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1165 && cc == L'\\' && psz[0] == L'\\' )
1166 {
1167 if (buf)
1168 *buf++ = (char)cc;
1169 psz++;
1170 len++;
1171 }
1172 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1173 cc == L'\\' &&
1174 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1175 {
1176 if (buf)
1177 {
1178 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1179 (psz[1] - L'0') * 010 +
1180 (psz[2] - L'0'));
1181 }
1182
1183 psz += 3;
1184 len++;
1185 }
1186 else
1187 {
1188 unsigned cnt;
1189 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1190 {
1191 }
1192
1193 if (!cnt)
1194 {
1195 // plain ASCII char
1196 if (buf)
1197 *buf++ = (char) cc;
1198 len++;
1199 }
1200 else
1201 {
1202 len += cnt + 1;
1203 if (buf)
1204 {
1205 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1206 while (cnt--)
1207 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1208 }
1209 }
1210 }
1211 }
1212
1213 if (buf && (len < n))
1214 *buf = 0;
1215
1216 return len;
1217 }
1218
1219 // ============================================================================
1220 // UTF-16
1221 // ============================================================================
1222
// the "straight" converter is implemented below by copying the UTF-16 units
// without change and the "swap" one by swapping their bytes: choose which of
// them implements the LE and the BE class depending on WORDS_BIGENDIAN
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1230
1231 /* static */
1232 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1233 {
1234 if ( srcLen == wxNO_LEN )
1235 {
1236 // count the number of bytes in input, including the trailing NULs
1237 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1238 for ( srcLen = 1; *inBuff++; srcLen++ )
1239 ;
1240
1241 srcLen *= BYTES_PER_CHAR;
1242 }
1243 else // we already have the length
1244 {
1245 // we can only convert an entire number of UTF-16 characters
1246 if ( srcLen % BYTES_PER_CHAR )
1247 return wxCONV_FAILED;
1248 }
1249
1250 return srcLen;
1251 }
1252
1253 // case when in-memory representation is UTF-16 too
1254 #ifdef WC_UTF16
1255
1256 // ----------------------------------------------------------------------------
1257 // conversions without endianness change
1258 // ----------------------------------------------------------------------------
1259
1260 size_t
1261 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1262 const char *src, size_t srcLen) const
1263 {
1264 // set up the scene for using memcpy() (which is presumably more efficient
1265 // than copying the bytes one by one)
1266 srcLen = GetLength(src, srcLen);
1267 if ( srcLen == wxNO_LEN )
1268 return wxCONV_FAILED;
1269
1270 const size_t inLen = srcLen / BYTES_PER_CHAR;
1271 if ( dst )
1272 {
1273 if ( dstLen < inLen )
1274 return wxCONV_FAILED;
1275
1276 memcpy(dst, src, srcLen);
1277 }
1278
1279 return inLen;
1280 }
1281
1282 size_t
1283 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1284 const wchar_t *src, size_t srcLen) const
1285 {
1286 if ( srcLen == wxNO_LEN )
1287 srcLen = wxWcslen(src) + 1;
1288
1289 srcLen *= BYTES_PER_CHAR;
1290
1291 if ( dst )
1292 {
1293 if ( dstLen < srcLen )
1294 return wxCONV_FAILED;
1295
1296 memcpy(dst, src, srcLen);
1297 }
1298
1299 return srcLen;
1300 }
1301
1302 // ----------------------------------------------------------------------------
1303 // endian-reversing conversions
1304 // ----------------------------------------------------------------------------
1305
1306 size_t
1307 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1308 const char *src, size_t srcLen) const
1309 {
1310 srcLen = GetLength(src, srcLen);
1311 if ( srcLen == wxNO_LEN )
1312 return wxCONV_FAILED;
1313
1314 srcLen /= BYTES_PER_CHAR;
1315
1316 if ( dst )
1317 {
1318 if ( dstLen < srcLen )
1319 return wxCONV_FAILED;
1320
1321 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1322 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1323 {
1324 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1325 }
1326 }
1327
1328 return srcLen;
1329 }
1330
1331 size_t
1332 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1333 const wchar_t *src, size_t srcLen) const
1334 {
1335 if ( srcLen == wxNO_LEN )
1336 srcLen = wxWcslen(src) + 1;
1337
1338 srcLen *= BYTES_PER_CHAR;
1339
1340 if ( dst )
1341 {
1342 if ( dstLen < srcLen )
1343 return wxCONV_FAILED;
1344
1345 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1346 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1347 {
1348 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1349 }
1350 }
1351
1352 return srcLen;
1353 }
1354
1355 #else // !WC_UTF16: wchar_t is UTF-32
1356
1357 // ----------------------------------------------------------------------------
1358 // conversions without endianness change
1359 // ----------------------------------------------------------------------------
1360
1361 size_t
1362 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1363 const char *src, size_t srcLen) const
1364 {
1365 srcLen = GetLength(src, srcLen);
1366 if ( srcLen == wxNO_LEN )
1367 return wxCONV_FAILED;
1368
1369 const size_t inLen = srcLen / BYTES_PER_CHAR;
1370 if ( !dst )
1371 {
1372 // optimization: return maximal space which could be needed for this
1373 // string even if the real size could be smaller if the buffer contains
1374 // any surrogates
1375 return inLen;
1376 }
1377
1378 size_t outLen = 0;
1379 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1380 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1381 {
1382 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1383 if ( !inBuff )
1384 return wxCONV_FAILED;
1385
1386 if ( ++outLen > dstLen )
1387 return wxCONV_FAILED;
1388
1389 *dst++ = ch;
1390 }
1391
1392
1393 return outLen;
1394 }
1395
1396 size_t
1397 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1398 const wchar_t *src, size_t srcLen) const
1399 {
1400 if ( srcLen == wxNO_LEN )
1401 srcLen = wxWcslen(src) + 1;
1402
1403 size_t outLen = 0;
1404 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1405 for ( size_t n = 0; n < srcLen; n++ )
1406 {
1407 wxUint16 cc[2];
1408 const size_t numChars = encode_utf16(*src++, cc);
1409 if ( numChars == wxCONV_FAILED )
1410 return wxCONV_FAILED;
1411
1412 outLen += numChars * BYTES_PER_CHAR;
1413 if ( outBuff )
1414 {
1415 if ( outLen > dstLen )
1416 return wxCONV_FAILED;
1417
1418 *outBuff++ = cc[0];
1419 if ( numChars == 2 )
1420 {
1421 // second character of a surrogate
1422 *outBuff++ = cc[1];
1423 }
1424 }
1425 }
1426
1427 return outLen;
1428 }
1429
1430 // ----------------------------------------------------------------------------
1431 // endian-reversing conversions
1432 // ----------------------------------------------------------------------------
1433
1434 size_t
1435 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1436 const char *src, size_t srcLen) const
1437 {
1438 srcLen = GetLength(src, srcLen);
1439 if ( srcLen == wxNO_LEN )
1440 return wxCONV_FAILED;
1441
1442 const size_t inLen = srcLen / BYTES_PER_CHAR;
1443 if ( !dst )
1444 {
1445 // optimization: return maximal space which could be needed for this
1446 // string even if the real size could be smaller if the buffer contains
1447 // any surrogates
1448 return inLen;
1449 }
1450
1451 size_t outLen = 0;
1452 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1453 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1454 {
1455 wxUint32 ch;
1456 wxUint16 tmp[2];
1457
1458 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1459 inBuff++;
1460 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1461
1462 const size_t numChars = decode_utf16(tmp, ch);
1463 if ( numChars == wxCONV_FAILED )
1464 return wxCONV_FAILED;
1465
1466 if ( numChars == 2 )
1467 inBuff++;
1468
1469 if ( ++outLen > dstLen )
1470 return wxCONV_FAILED;
1471
1472 *dst++ = ch;
1473 }
1474
1475
1476 return outLen;
1477 }
1478
1479 size_t
1480 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1481 const wchar_t *src, size_t srcLen) const
1482 {
1483 if ( srcLen == wxNO_LEN )
1484 srcLen = wxWcslen(src) + 1;
1485
1486 size_t outLen = 0;
1487 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1488 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1489 {
1490 wxUint16 cc[2];
1491 const size_t numChars = encode_utf16(*src, cc);
1492 if ( numChars == wxCONV_FAILED )
1493 return wxCONV_FAILED;
1494
1495 outLen += numChars * BYTES_PER_CHAR;
1496 if ( outBuff )
1497 {
1498 if ( outLen > dstLen )
1499 return wxCONV_FAILED;
1500
1501 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1502 if ( numChars == 2 )
1503 {
1504 // second character of a surrogate
1505 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1506 }
1507 }
1508 }
1509
1510 return outLen;
1511 }
1512
1513 #endif // WC_UTF16/!WC_UTF16
1514
1515
1516 // ============================================================================
1517 // UTF-32
1518 // ============================================================================
1519
// "straight" is the UTF-32 flavour stored in this machine's byte order (its
// conversions below don't swap anything), "swap" is the opposite one
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32LE
    #define wxMBConvUTF32swap     wxMBConvUTF32BE
#else // big endian host
    #define wxMBConvUTF32straight wxMBConvUTF32BE
    #define wxMBConvUTF32swap     wxMBConvUTF32LE
#endif
1527
1528
1529 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1530 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1531
1532 /* static */
1533 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1534 {
1535 if ( srcLen == wxNO_LEN )
1536 {
1537 // count the number of bytes in input, including the trailing NULs
1538 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1539 for ( srcLen = 1; *inBuff++; srcLen++ )
1540 ;
1541
1542 srcLen *= BYTES_PER_CHAR;
1543 }
1544 else // we already have the length
1545 {
1546 // we can only convert an entire number of UTF-32 characters
1547 if ( srcLen % BYTES_PER_CHAR )
1548 return wxCONV_FAILED;
1549 }
1550
1551 return srcLen;
1552 }
1553
1554 // case when in-memory representation is UTF-16
1555 #ifdef WC_UTF16
1556
1557 // ----------------------------------------------------------------------------
1558 // conversions without endianness change
1559 // ----------------------------------------------------------------------------
1560
1561 size_t
1562 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1563 const char *src, size_t srcLen) const
1564 {
1565 srcLen = GetLength(src, srcLen);
1566 if ( srcLen == wxNO_LEN )
1567 return wxCONV_FAILED;
1568
1569 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1570 const size_t inLen = srcLen / BYTES_PER_CHAR;
1571 size_t outLen = 0;
1572 for ( size_t n = 0; n < inLen; n++ )
1573 {
1574 wxUint16 cc[2];
1575 const size_t numChars = encode_utf16(*inBuff++, cc);
1576 if ( numChars == wxCONV_FAILED )
1577 return wxCONV_FAILED;
1578
1579 outLen += numChars;
1580 if ( dst )
1581 {
1582 if ( outLen > dstLen )
1583 return wxCONV_FAILED;
1584
1585 *dst++ = cc[0];
1586 if ( numChars == 2 )
1587 {
1588 // second character of a surrogate
1589 *dst++ = cc[1];
1590 }
1591 }
1592 }
1593
1594 return outLen;
1595 }
1596
1597 size_t
1598 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1599 const wchar_t *src, size_t srcLen) const
1600 {
1601 if ( srcLen == wxNO_LEN )
1602 srcLen = wxWcslen(src) + 1;
1603
1604 if ( !dst )
1605 {
1606 // optimization: return maximal space which could be needed for this
1607 // string instead of the exact amount which could be less if there are
1608 // any surrogates in the input
1609 //
1610 // we consider that surrogates are rare enough to make it worthwhile to
1611 // avoid running the loop below at the cost of slightly extra memory
1612 // consumption
1613 return srcLen * BYTES_PER_CHAR;
1614 }
1615
1616 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1617 size_t outLen = 0;
1618 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1619 {
1620 const wxUint32 ch = wxDecodeSurrogate(&src);
1621 if ( !src )
1622 return wxCONV_FAILED;
1623
1624 outLen += BYTES_PER_CHAR;
1625
1626 if ( outLen > dstLen )
1627 return wxCONV_FAILED;
1628
1629 *outBuff++ = ch;
1630 }
1631
1632 return outLen;
1633 }
1634
1635 // ----------------------------------------------------------------------------
1636 // endian-reversing conversions
1637 // ----------------------------------------------------------------------------
1638
1639 size_t
1640 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1641 const char *src, size_t srcLen) const
1642 {
1643 srcLen = GetLength(src, srcLen);
1644 if ( srcLen == wxNO_LEN )
1645 return wxCONV_FAILED;
1646
1647 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1648 const size_t inLen = srcLen / BYTES_PER_CHAR;
1649 size_t outLen = 0;
1650 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1651 {
1652 wxUint16 cc[2];
1653 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1654 if ( numChars == wxCONV_FAILED )
1655 return wxCONV_FAILED;
1656
1657 outLen += numChars;
1658 if ( dst )
1659 {
1660 if ( outLen > dstLen )
1661 return wxCONV_FAILED;
1662
1663 *dst++ = cc[0];
1664 if ( numChars == 2 )
1665 {
1666 // second character of a surrogate
1667 *dst++ = cc[1];
1668 }
1669 }
1670 }
1671
1672 return outLen;
1673 }
1674
1675 size_t
1676 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1677 const wchar_t *src, size_t srcLen) const
1678 {
1679 if ( srcLen == wxNO_LEN )
1680 srcLen = wxWcslen(src) + 1;
1681
1682 if ( !dst )
1683 {
1684 // optimization: return maximal space which could be needed for this
1685 // string instead of the exact amount which could be less if there are
1686 // any surrogates in the input
1687 //
1688 // we consider that surrogates are rare enough to make it worthwhile to
1689 // avoid running the loop below at the cost of slightly extra memory
1690 // consumption
1691 return srcLen*BYTES_PER_CHAR;
1692 }
1693
1694 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1695 size_t outLen = 0;
1696 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1697 {
1698 const wxUint32 ch = wxDecodeSurrogate(&src);
1699 if ( !src )
1700 return wxCONV_FAILED;
1701
1702 outLen += BYTES_PER_CHAR;
1703
1704 if ( outLen > dstLen )
1705 return wxCONV_FAILED;
1706
1707 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1708 }
1709
1710 return outLen;
1711 }
1712
1713 #else // !WC_UTF16: wchar_t is UTF-32
1714
1715 // ----------------------------------------------------------------------------
1716 // conversions without endianness change
1717 // ----------------------------------------------------------------------------
1718
1719 size_t
1720 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1721 const char *src, size_t srcLen) const
1722 {
1723 // use memcpy() as it should be much faster than hand-written loop
1724 srcLen = GetLength(src, srcLen);
1725 if ( srcLen == wxNO_LEN )
1726 return wxCONV_FAILED;
1727
1728 const size_t inLen = srcLen/BYTES_PER_CHAR;
1729 if ( dst )
1730 {
1731 if ( dstLen < inLen )
1732 return wxCONV_FAILED;
1733
1734 memcpy(dst, src, srcLen);
1735 }
1736
1737 return inLen;
1738 }
1739
1740 size_t
1741 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1742 const wchar_t *src, size_t srcLen) const
1743 {
1744 if ( srcLen == wxNO_LEN )
1745 srcLen = wxWcslen(src) + 1;
1746
1747 srcLen *= BYTES_PER_CHAR;
1748
1749 if ( dst )
1750 {
1751 if ( dstLen < srcLen )
1752 return wxCONV_FAILED;
1753
1754 memcpy(dst, src, srcLen);
1755 }
1756
1757 return srcLen;
1758 }
1759
1760 // ----------------------------------------------------------------------------
1761 // endian-reversing conversions
1762 // ----------------------------------------------------------------------------
1763
1764 size_t
1765 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1766 const char *src, size_t srcLen) const
1767 {
1768 srcLen = GetLength(src, srcLen);
1769 if ( srcLen == wxNO_LEN )
1770 return wxCONV_FAILED;
1771
1772 srcLen /= BYTES_PER_CHAR;
1773
1774 if ( dst )
1775 {
1776 if ( dstLen < srcLen )
1777 return wxCONV_FAILED;
1778
1779 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1780 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1781 {
1782 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1783 }
1784 }
1785
1786 return srcLen;
1787 }
1788
1789 size_t
1790 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1791 const wchar_t *src, size_t srcLen) const
1792 {
1793 if ( srcLen == wxNO_LEN )
1794 srcLen = wxWcslen(src) + 1;
1795
1796 srcLen *= BYTES_PER_CHAR;
1797
1798 if ( dst )
1799 {
1800 if ( dstLen < srcLen )
1801 return wxCONV_FAILED;
1802
1803 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1804 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1805 {
1806 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1807 }
1808 }
1809
1810 return srcLen;
1811 }
1812
1813 #endif // WC_UTF16/!WC_UTF16
1814
1815
1816 // ============================================================================
1817 // The classes doing conversion using the iconv_xxx() functions
1818 // ============================================================================
1819
1820 #ifdef HAVE_ICONV
1821
1822 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1823 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1824 // (unless there's yet another bug in glibc) the only case when iconv()
1825 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1826 // left in the input buffer -- when _real_ error occurs,
1827 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1828 // iconv() failure.
1829 // [This bug does not appear in glibc 2.2.]
1830 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1831 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1832 (errno != E2BIG || bufLeft != 0))
1833 #else
1834 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1835 #endif
1836
1837 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1838
1839 #define ICONV_T_INVALID ((iconv_t)-1)
1840
1841 #if SIZEOF_WCHAR_T == 4
1842 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1843 #define WC_ENC wxFONTENCODING_UTF32
1844 #elif SIZEOF_WCHAR_T == 2
1845 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1846 #define WC_ENC wxFONTENCODING_UTF16
1847 #else // sizeof(wchar_t) != 2 nor 4
1848 // does this ever happen?
1849 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1850 #endif
1851
1852 // ----------------------------------------------------------------------------
1853 // wxMBConv_iconv: encapsulates an iconv character set
1854 // ----------------------------------------------------------------------------
1855
1856 class wxMBConv_iconv : public wxMBConv
1857 {
1858 public:
1859 wxMBConv_iconv(const char *name);
1860 virtual ~wxMBConv_iconv();
1861
1862 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1863 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1864
1865 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1866 virtual size_t GetMBNulLen() const;
1867
1868 #if wxUSE_UNICODE_UTF8
1869 virtual bool IsUTF8() const;
1870 #endif
1871
1872 virtual wxMBConv *Clone() const
1873 {
1874 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1875 p->m_minMBCharWidth = m_minMBCharWidth;
1876 return p;
1877 }
1878
1879 bool IsOk() const
1880 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1881
1882 protected:
1883 // the iconv handlers used to translate from multibyte
1884 // to wide char and in the other direction
1885 iconv_t m2w,
1886 w2m;
1887
1888 #if wxUSE_THREADS
1889 // guards access to m2w and w2m objects
1890 wxMutex m_iconvMutex;
1891 #endif
1892
1893 private:
1894 // the name (for iconv_open()) of a wide char charset -- if none is
1895 // available on this machine, it will remain NULL
1896 static wxString ms_wcCharsetName;
1897
1898 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1899 // different endian-ness than the native one
1900 static bool ms_wcNeedsSwap;
1901
1902
1903 // name of the encoding handled by this conversion
1904 wxString m_name;
1905
1906 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1907 // initially
1908 size_t m_minMBCharWidth;
1909 };
1910
1911 // make the constructor available for unit testing
1912 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1913 {
1914 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1915 if ( !result->IsOk() )
1916 {
1917 delete result;
1918 return 0;
1919 }
1920
1921 return result;
1922 }
1923
1924 wxString wxMBConv_iconv::ms_wcCharsetName;
1925 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1926
1927 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1928 : m_name(name)
1929 {
1930 m_minMBCharWidth = 0;
1931
1932 // check for charset that represents wchar_t:
1933 if ( ms_wcCharsetName.empty() )
1934 {
1935 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1936
1937 #if wxUSE_FONTMAP
1938 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1939 #else // !wxUSE_FONTMAP
1940 static const wxChar *names_static[] =
1941 {
1942 #if SIZEOF_WCHAR_T == 4
1943 _T("UCS-4"),
1944 #elif SIZEOF_WCHAR_T = 2
1945 _T("UCS-2"),
1946 #endif
1947 NULL
1948 };
1949 const wxChar **names = names_static;
1950 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1951
1952 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1953 {
1954 const wxString nameCS(*names);
1955
1956 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1957 wxString nameXE(nameCS);
1958
1959 #ifdef WORDS_BIGENDIAN
1960 nameXE += _T("BE");
1961 #else // little endian
1962 nameXE += _T("LE");
1963 #endif
1964
1965 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1966 nameXE.c_str());
1967
1968 m2w = iconv_open(nameXE.ToAscii(), name);
1969 if ( m2w == ICONV_T_INVALID )
1970 {
1971 // try charset w/o bytesex info (e.g. "UCS4")
1972 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1973 nameCS.c_str());
1974 m2w = iconv_open(nameCS.ToAscii(), name);
1975
1976 // and check for bytesex ourselves:
1977 if ( m2w != ICONV_T_INVALID )
1978 {
1979 char buf[2], *bufPtr;
1980 wchar_t wbuf[2], *wbufPtr;
1981 size_t insz, outsz;
1982 size_t res;
1983
1984 buf[0] = 'A';
1985 buf[1] = 0;
1986 wbuf[0] = 0;
1987 insz = 2;
1988 outsz = SIZEOF_WCHAR_T * 2;
1989 wbufPtr = wbuf;
1990 bufPtr = buf;
1991
1992 res = iconv(
1993 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1994 (char**)&wbufPtr, &outsz);
1995
1996 if (ICONV_FAILED(res, insz))
1997 {
1998 wxLogLastError(wxT("iconv"));
1999 wxLogError(_("Conversion to charset '%s' doesn't work."),
2000 nameCS.c_str());
2001 }
2002 else // ok, can convert to this encoding, remember it
2003 {
2004 ms_wcCharsetName = nameCS;
2005 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2006 }
2007 }
2008 }
2009 else // use charset not requiring byte swapping
2010 {
2011 ms_wcCharsetName = nameXE;
2012 }
2013 }
2014
2015 wxLogTrace(TRACE_STRCONV,
2016 wxT("iconv wchar_t charset is \"%s\"%s"),
2017 ms_wcCharsetName.empty() ? wxString("<none>")
2018 : ms_wcCharsetName,
2019 ms_wcNeedsSwap ? _T(" (needs swap)")
2020 : _T(""));
2021 }
2022 else // we already have ms_wcCharsetName
2023 {
2024 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2025 }
2026
2027 if ( ms_wcCharsetName.empty() )
2028 {
2029 w2m = ICONV_T_INVALID;
2030 }
2031 else
2032 {
2033 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2034 if ( w2m == ICONV_T_INVALID )
2035 {
2036 wxLogTrace(TRACE_STRCONV,
2037 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2038 ms_wcCharsetName.c_str(), name);
2039 }
2040 }
2041 }
2042
2043 wxMBConv_iconv::~wxMBConv_iconv()
2044 {
2045 if ( m2w != ICONV_T_INVALID )
2046 iconv_close(m2w);
2047 if ( w2m != ICONV_T_INVALID )
2048 iconv_close(w2m);
2049 }
2050
2051 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2052 {
2053 // find the string length: notice that must be done differently for
2054 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2055 size_t inbuf;
2056 const size_t nulLen = GetMBNulLen();
2057 switch ( nulLen )
2058 {
2059 default:
2060 return wxCONV_FAILED;
2061
2062 case 1:
2063 inbuf = strlen(psz); // arguably more optimized than our version
2064 break;
2065
2066 case 2:
2067 case 4:
2068 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2069 // they also have to start at character boundary and not span two
2070 // adjacent characters
2071 const char *p;
2072 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2073 ;
2074 inbuf = p - psz;
2075 break;
2076 }
2077
2078 #if wxUSE_THREADS
2079 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2080 // Unfortunately there are a couple of global wxCSConv objects such as
2081 // wxConvLocal that are used all over wx code, so we have to make sure
2082 // the handle is used by at most one thread at the time. Otherwise
2083 // only a few wx classes would be safe to use from non-main threads
2084 // as MB<->WC conversion would fail "randomly".
2085 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2086 #endif // wxUSE_THREADS
2087
2088 size_t outbuf = n * SIZEOF_WCHAR_T;
2089 size_t res, cres;
2090 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2091 wchar_t *bufPtr = buf;
2092 const char *pszPtr = psz;
2093
2094 if (buf)
2095 {
2096 // have destination buffer, convert there
2097 cres = iconv(m2w,
2098 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2099 (char**)&bufPtr, &outbuf);
2100 res = n - (outbuf / SIZEOF_WCHAR_T);
2101
2102 if (ms_wcNeedsSwap)
2103 {
2104 // convert to native endianness
2105 for ( unsigned i = 0; i < res; i++ )
2106 buf[n] = WC_BSWAP(buf[i]);
2107 }
2108
2109 // NUL-terminate the string if there is any space left
2110 if (res < n)
2111 buf[res] = 0;
2112 }
2113 else
2114 {
2115 // no destination buffer... convert using temp buffer
2116 // to calculate destination buffer requirement
2117 wchar_t tbuf[8];
2118 res = 0;
2119
2120 do
2121 {
2122 bufPtr = tbuf;
2123 outbuf = 8 * SIZEOF_WCHAR_T;
2124
2125 cres = iconv(m2w,
2126 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2127 (char**)&bufPtr, &outbuf );
2128
2129 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2130 }
2131 while ((cres == (size_t)-1) && (errno == E2BIG));
2132 }
2133
2134 if (ICONV_FAILED(cres, inbuf))
2135 {
2136 //VS: it is ok if iconv fails, hence trace only
2137 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2138 return wxCONV_FAILED;
2139 }
2140
2141 return res;
2142 }
2143
2144 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2145 {
2146 #if wxUSE_THREADS
2147 // NB: explained in MB2WC
2148 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2149 #endif
2150
2151 size_t inlen = wxWcslen(psz);
2152 size_t inbuf = inlen * SIZEOF_WCHAR_T;
2153 size_t outbuf = n;
2154 size_t res, cres;
2155
2156 wchar_t *tmpbuf = 0;
2157
2158 if (ms_wcNeedsSwap)
2159 {
2160 // need to copy to temp buffer to switch endianness
2161 // (doing WC_BSWAP twice on the original buffer won't help, as it
2162 // could be in read-only memory, or be accessed in some other thread)
2163 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2164 for ( size_t i = 0; i < inlen; i++ )
2165 tmpbuf[n] = WC_BSWAP(psz[i]);
2166
2167 tmpbuf[inlen] = L'\0';
2168 psz = tmpbuf;
2169 }
2170
2171 if (buf)
2172 {
2173 // have destination buffer, convert there
2174 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2175
2176 res = n - outbuf;
2177
2178 // NB: iconv was given only wcslen(psz) characters on input, and so
2179 // it couldn't convert the trailing zero. Let's do it ourselves
2180 // if there's some room left for it in the output buffer.
2181 if (res < n)
2182 buf[0] = 0;
2183 }
2184 else
2185 {
2186 // no destination buffer: convert using temp buffer
2187 // to calculate destination buffer requirement
2188 char tbuf[16];
2189 res = 0;
2190 do
2191 {
2192 buf = tbuf;
2193 outbuf = 16;
2194
2195 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2196
2197 res += 16 - outbuf;
2198 }
2199 while ((cres == (size_t)-1) && (errno == E2BIG));
2200 }
2201
2202 if (ms_wcNeedsSwap)
2203 {
2204 free(tmpbuf);
2205 }
2206
2207 if (ICONV_FAILED(cres, inbuf))
2208 {
2209 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2210 return wxCONV_FAILED;
2211 }
2212
2213 return res;
2214 }
2215
2216 size_t wxMBConv_iconv::GetMBNulLen() const
2217 {
2218 if ( m_minMBCharWidth == 0 )
2219 {
2220 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2221
2222 #if wxUSE_THREADS
2223 // NB: explained in MB2WC
2224 wxMutexLocker lock(self->m_iconvMutex);
2225 #endif
2226
2227 const wchar_t *wnul = L"";
2228 char buf[8]; // should be enough for NUL in any encoding
2229 size_t inLen = sizeof(wchar_t),
2230 outLen = WXSIZEOF(buf);
2231 char *inBuff = (char *)wnul;
2232 char *outBuff = buf;
2233 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2234 {
2235 self->m_minMBCharWidth = (size_t)-1;
2236 }
2237 else // ok
2238 {
2239 self->m_minMBCharWidth = outBuff - buf;
2240 }
2241 }
2242
2243 return m_minMBCharWidth;
2244 }
2245
2246 #if wxUSE_UNICODE_UTF8
2247 bool wxMBConv_iconv::IsUTF8() const
2248 {
2249 return wxStricmp(m_name, "UTF-8") == 0 ||
2250 wxStricmp(m_name, "UTF8") == 0;
2251 }
2252 #endif
2253
2254 #endif // HAVE_ICONV
2255
2256
2257 // ============================================================================
2258 // Win32 conversion classes
2259 // ============================================================================
2260
2261 #ifdef wxHAVE_WIN32_MB2WC
2262
2263 // from utils.cpp
2264 #if wxUSE_FONTMAP
2265 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2266 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2267 #endif
2268
2269 class wxMBConv_win32 : public wxMBConv
2270 {
2271 public:
2272 wxMBConv_win32()
2273 {
2274 m_CodePage = CP_ACP;
2275 m_minMBCharWidth = 0;
2276 }
2277
2278 wxMBConv_win32(const wxMBConv_win32& conv)
2279 : wxMBConv()
2280 {
2281 m_CodePage = conv.m_CodePage;
2282 m_minMBCharWidth = conv.m_minMBCharWidth;
2283 }
2284
2285 #if wxUSE_FONTMAP
2286 wxMBConv_win32(const char* name)
2287 {
2288 m_CodePage = wxCharsetToCodepage(name);
2289 m_minMBCharWidth = 0;
2290 }
2291
2292 wxMBConv_win32(wxFontEncoding encoding)
2293 {
2294 m_CodePage = wxEncodingToCodepage(encoding);
2295 m_minMBCharWidth = 0;
2296 }
2297 #endif // wxUSE_FONTMAP
2298
2299 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2300 {
        // note that we have to use the MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and breaks the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
2306 //
2307 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2308 // Win XP or newer and it is not supported for UTF-[78] so we always
2309 // use our own conversions in this case. See
2310 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2311 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2312 if ( m_CodePage == CP_UTF8 )
2313 {
2314 return wxMBConvUTF8().MB2WC(buf, psz, n);
2315 }
2316
2317 if ( m_CodePage == CP_UTF7 )
2318 {
2319 return wxMBConvUTF7().MB2WC(buf, psz, n);
2320 }
2321
2322 int flags = 0;
2323 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2324 IsAtLeastWin2kSP4() )
2325 {
2326 flags = MB_ERR_INVALID_CHARS;
2327 }
2328
2329 const size_t len = ::MultiByteToWideChar
2330 (
2331 m_CodePage, // code page
2332 flags, // flags: fall on error
2333 psz, // input string
2334 -1, // its length (NUL-terminated)
2335 buf, // output string
2336 buf ? n : 0 // size of output buffer
2337 );
2338 if ( !len )
2339 {
2340 // function totally failed
2341 return wxCONV_FAILED;
2342 }
2343
2344 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2345 // check if we succeeded, by doing a double trip:
2346 if ( !flags && buf )
2347 {
2348 const size_t mbLen = strlen(psz);
2349 wxCharBuffer mbBuf(mbLen);
2350 if ( ::WideCharToMultiByte
2351 (
2352 m_CodePage,
2353 0,
2354 buf,
2355 -1,
2356 mbBuf.data(),
2357 mbLen + 1, // size in bytes, not length
2358 NULL,
2359 NULL
2360 ) == 0 ||
2361 strcmp(mbBuf, psz) != 0 )
2362 {
2363 // we didn't obtain the same thing we started from, hence
2364 // the conversion was lossy and we consider that it failed
2365 return wxCONV_FAILED;
2366 }
2367 }
2368
2369 // note that it returns count of written chars for buf != NULL and size
2370 // of the needed buffer for buf == NULL so in either case the length of
2371 // the string (which never includes the terminating NUL) is one less
2372 return len - 1;
2373 }
2374
2375 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2376 {
2377 /*
2378 we have a problem here: by default, WideCharToMultiByte() may
2379 replace characters unrepresentable in the target code page with bad
2380 quality approximations such as turning "1/2" symbol (U+00BD) into
2381 "1" for the code pages which don't have it and we, obviously, want
2382 to avoid this at any price
2383
2384 the trouble is that this function does it _silently_, i.e. it won't
2385 even tell us whether it did or not... Win98/2000 and higher provide
2386 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2387 we have to resort to a round trip, i.e. check that converting back
2388 results in the same string -- this is, of course, expensive but
2389 otherwise we simply can't be sure to not garble the data.
2390 */
2391
2392 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2393 // it doesn't work with CJK encodings (which we test for rather roughly
2394 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2395 // supporting it
2396 BOOL usedDef wxDUMMY_INITIALIZE(false);
2397 BOOL *pUsedDef;
2398 int flags;
2399 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2400 {
2401 // it's our lucky day
2402 flags = WC_NO_BEST_FIT_CHARS;
2403 pUsedDef = &usedDef;
2404 }
2405 else // old system or unsupported encoding
2406 {
2407 flags = 0;
2408 pUsedDef = NULL;
2409 }
2410
2411 const size_t len = ::WideCharToMultiByte
2412 (
2413 m_CodePage, // code page
2414 flags, // either none or no best fit
2415 pwz, // input string
2416 -1, // it is (wide) NUL-terminated
2417 buf, // output buffer
2418 buf ? n : 0, // and its size
2419 NULL, // default "replacement" char
2420 pUsedDef // [out] was it used?
2421 );
2422
2423 if ( !len )
2424 {
2425 // function totally failed
2426 return wxCONV_FAILED;
2427 }
2428
2429 // if we were really converting, check if we succeeded
2430 if ( buf )
2431 {
2432 if ( flags )
2433 {
2434 // check if the conversion failed, i.e. if any replacements
2435 // were done
2436 if ( usedDef )
2437 return wxCONV_FAILED;
2438 }
2439 else // we must resort to double tripping...
2440 {
2441 wxWCharBuffer wcBuf(n);
2442 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2443 wcscmp(wcBuf, pwz) != 0 )
2444 {
2445 // we didn't obtain the same thing we started from, hence
2446 // the conversion was lossy and we consider that it failed
2447 return wxCONV_FAILED;
2448 }
2449 }
2450 }
2451
2452 // see the comment above for the reason of "len - 1"
2453 return len - 1;
2454 }
2455
2456 virtual size_t GetMBNulLen() const
2457 {
2458 if ( m_minMBCharWidth == 0 )
2459 {
2460 int len = ::WideCharToMultiByte
2461 (
2462 m_CodePage, // code page
2463 0, // no flags
2464 L"", // input string
2465 1, // translate just the NUL
2466 NULL, // output buffer
2467 0, // and its size
2468 NULL, // no replacement char
2469 NULL // [out] don't care if it was used
2470 );
2471
2472 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2473 switch ( len )
2474 {
2475 default:
2476 wxLogDebug(_T("Unexpected NUL length %d"), len);
2477 self->m_minMBCharWidth = (size_t)-1;
2478 break;
2479
2480 case 0:
2481 self->m_minMBCharWidth = (size_t)-1;
2482 break;
2483
2484 case 1:
2485 case 2:
2486 case 4:
2487 self->m_minMBCharWidth = len;
2488 break;
2489 }
2490 }
2491
2492 return m_minMBCharWidth;
2493 }
2494
2495 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2496
2497 bool IsOk() const { return m_CodePage != -1; }
2498
2499 private:
2500 static bool CanUseNoBestFit()
2501 {
2502 static int s_isWin98Or2k = -1;
2503
2504 if ( s_isWin98Or2k == -1 )
2505 {
2506 int verMaj, verMin;
2507 switch ( wxGetOsVersion(&verMaj, &verMin) )
2508 {
2509 case wxOS_WINDOWS_9X:
2510 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2511 break;
2512
2513 case wxOS_WINDOWS_NT:
2514 s_isWin98Or2k = verMaj >= 5;
2515 break;
2516
2517 default:
2518 // unknown: be conservative by default
2519 s_isWin98Or2k = 0;
2520 break;
2521 }
2522
2523 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2524 }
2525
2526 return s_isWin98Or2k == 1;
2527 }
2528
2529 static bool IsAtLeastWin2kSP4()
2530 {
2531 #ifdef __WXWINCE__
2532 return false;
2533 #else
2534 static int s_isAtLeastWin2kSP4 = -1;
2535
2536 if ( s_isAtLeastWin2kSP4 == -1 )
2537 {
2538 OSVERSIONINFOEX ver;
2539
2540 memset(&ver, 0, sizeof(ver));
2541 ver.dwOSVersionInfoSize = sizeof(ver);
2542 GetVersionEx((OSVERSIONINFO*)&ver);
2543
2544 s_isAtLeastWin2kSP4 =
2545 ((ver.dwMajorVersion > 5) || // Vista+
2546 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2547 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2548 ver.wServicePackMajor >= 4)) // 2000 SP4+
2549 ? 1 : 0;
2550 }
2551
2552 return s_isAtLeastWin2kSP4 == 1;
2553 #endif
2554 }
2555
2556
2557 // the code page we're working with
2558 long m_CodePage;
2559
2560 // cached result of GetMBNulLen(), set to 0 initially meaning
2561 // "unknown"
2562 size_t m_minMBCharWidth;
2563 };
2564
2565 #endif // wxHAVE_WIN32_MB2WC
2566
2567
2568 // ============================================================================
2569 // wxEncodingConverter based conversion classes
2570 // ============================================================================
2571
2572 #if wxUSE_FONTMAP
2573
2574 class wxMBConv_wxwin : public wxMBConv
2575 {
2576 private:
2577 void Init()
2578 {
2579 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2580 // The wxMBConv_cf class does a better job.
2581 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2582 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2583 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2584 }
2585
2586 public:
2587 // temporarily just use wxEncodingConverter stuff,
2588 // so that it works while a better implementation is built
2589 wxMBConv_wxwin(const char* name)
2590 {
2591 if (name)
2592 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2593 else
2594 m_enc = wxFONTENCODING_SYSTEM;
2595
2596 Init();
2597 }
2598
2599 wxMBConv_wxwin(wxFontEncoding enc)
2600 {
2601 m_enc = enc;
2602
2603 Init();
2604 }
2605
2606 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2607 {
2608 size_t inbuf = strlen(psz);
2609 if (buf)
2610 {
2611 if (!m2w.Convert(psz, buf))
2612 return wxCONV_FAILED;
2613 }
2614 return inbuf;
2615 }
2616
2617 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2618 {
2619 const size_t inbuf = wxWcslen(psz);
2620 if (buf)
2621 {
2622 if (!w2m.Convert(psz, buf))
2623 return wxCONV_FAILED;
2624 }
2625
2626 return inbuf;
2627 }
2628
2629 virtual size_t GetMBNulLen() const
2630 {
2631 switch ( m_enc )
2632 {
2633 case wxFONTENCODING_UTF16BE:
2634 case wxFONTENCODING_UTF16LE:
2635 return 2;
2636
2637 case wxFONTENCODING_UTF32BE:
2638 case wxFONTENCODING_UTF32LE:
2639 return 4;
2640
2641 default:
2642 return 1;
2643 }
2644 }
2645
2646 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2647
2648 bool IsOk() const { return m_ok; }
2649
2650 public:
2651 wxFontEncoding m_enc;
2652 wxEncodingConverter m2w, w2m;
2653
2654 private:
2655 // were we initialized successfully?
2656 bool m_ok;
2657
2658 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2659 };
2660
2661 // make the constructors available for unit testing
2662 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2663 {
2664 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2665 if ( !result->IsOk() )
2666 {
2667 delete result;
2668 return 0;
2669 }
2670
2671 return result;
2672 }
2673
2674 #endif // wxUSE_FONTMAP
2675
2676 // ============================================================================
2677 // wxCSConv implementation
2678 // ============================================================================
2679
2680 void wxCSConv::Init()
2681 {
2682 m_name = NULL;
2683 m_convReal = NULL;
2684 m_deferred = true;
2685 }
2686
2687 wxCSConv::wxCSConv(const wxString& charset)
2688 {
2689 Init();
2690
2691 if ( !charset.empty() )
2692 {
2693 SetName(charset.ToAscii());
2694 }
2695
2696 #if wxUSE_FONTMAP
2697 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2698 #else
2699 m_encoding = wxFONTENCODING_SYSTEM;
2700 #endif
2701 }
2702
2703 wxCSConv::wxCSConv(wxFontEncoding encoding)
2704 {
2705 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2706 {
2707 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2708
2709 encoding = wxFONTENCODING_SYSTEM;
2710 }
2711
2712 Init();
2713
2714 m_encoding = encoding;
2715 }
2716
2717 wxCSConv::~wxCSConv()
2718 {
2719 Clear();
2720 }
2721
2722 wxCSConv::wxCSConv(const wxCSConv& conv)
2723 : wxMBConv()
2724 {
2725 Init();
2726
2727 SetName(conv.m_name);
2728 m_encoding = conv.m_encoding;
2729 }
2730
2731 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2732 {
2733 Clear();
2734
2735 SetName(conv.m_name);
2736 m_encoding = conv.m_encoding;
2737
2738 return *this;
2739 }
2740
2741 void wxCSConv::Clear()
2742 {
2743 free(m_name);
2744 delete m_convReal;
2745
2746 m_name = NULL;
2747 m_convReal = NULL;
2748 }
2749
2750 void wxCSConv::SetName(const char *charset)
2751 {
2752 if (charset)
2753 {
2754 m_name = wxStrdup(charset);
2755 m_deferred = true;
2756 }
2757 }
2758
#if wxUSE_FONTMAP

// Map from an encoding to the charset name that iconv accepted for it; an
// empty string records that none of the names worked. It is filled and
// consulted by wxCSConv::DoCreate().
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;

#endif
2766
2767 wxMBConv *wxCSConv::DoCreate() const
2768 {
2769 #if wxUSE_FONTMAP
2770 wxLogTrace(TRACE_STRCONV,
2771 wxT("creating conversion for %s"),
2772 (m_name ? m_name
2773 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2774 #endif // wxUSE_FONTMAP
2775
2776 // check for the special case of ASCII or ISO8859-1 charset: as we have
2777 // special knowledge of it anyhow, we don't need to create a special
2778 // conversion object
2779 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2780 m_encoding == wxFONTENCODING_DEFAULT )
2781 {
2782 // don't convert at all
2783 return NULL;
2784 }
2785
2786 // we trust OS to do conversion better than we can so try external
2787 // conversion methods first
2788 //
2789 // the full order is:
2790 // 1. OS conversion (iconv() under Unix or Win32 API)
2791 // 2. hard coded conversions for UTF
2792 // 3. wxEncodingConverter as fall back
2793
2794 // step (1)
2795 #ifdef HAVE_ICONV
2796 #if !wxUSE_FONTMAP
2797 if ( m_name )
2798 #endif // !wxUSE_FONTMAP
2799 {
2800 #if wxUSE_FONTMAP
2801 wxFontEncoding encoding(m_encoding);
2802 #endif
2803
2804 if ( m_name )
2805 {
2806 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2807 if ( conv->IsOk() )
2808 return conv;
2809
2810 delete conv;
2811
2812 #if wxUSE_FONTMAP
2813 encoding =
2814 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2815 #endif // wxUSE_FONTMAP
2816 }
2817 #if wxUSE_FONTMAP
2818 {
2819 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2820 if ( it != gs_nameCache.end() )
2821 {
2822 if ( it->second.empty() )
2823 return NULL;
2824
2825 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2826 if ( conv->IsOk() )
2827 return conv;
2828
2829 delete conv;
2830 }
2831
2832 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2833 // CS : in case this does not return valid names (eg for MacRoman)
2834 // encoding got a 'failure' entry in the cache all the same,
2835 // although it just has to be created using a different method, so
2836 // only store failed iconv creation attempts (or perhaps we
2837 // shoulnd't do this at all ?)
2838 if ( names[0] != NULL )
2839 {
2840 for ( ; *names; ++names )
2841 {
2842 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2843 // will need changes that will obsolete this
2844 wxString name(*names);
2845 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2846 if ( conv->IsOk() )
2847 {
2848 gs_nameCache[encoding] = *names;
2849 return conv;
2850 }
2851
2852 delete conv;
2853 }
2854
2855 gs_nameCache[encoding] = _T(""); // cache the failure
2856 }
2857 }
2858 #endif // wxUSE_FONTMAP
2859 }
2860 #endif // HAVE_ICONV
2861
2862 #ifdef wxHAVE_WIN32_MB2WC
2863 {
2864 #if wxUSE_FONTMAP
2865 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2866 : new wxMBConv_win32(m_encoding);
2867 if ( conv->IsOk() )
2868 return conv;
2869
2870 delete conv;
2871 #else
2872 return NULL;
2873 #endif
2874 }
2875 #endif // wxHAVE_WIN32_MB2WC
2876
2877 #ifdef __DARWIN__
2878 {
2879 // leave UTF16 and UTF32 to the built-ins of wx
2880 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2881 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2882 {
2883 #if wxUSE_FONTMAP
2884 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2885 : new wxMBConv_cf(m_encoding);
2886 #else
2887 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2888 #endif
2889
2890 if ( conv->IsOk() )
2891 return conv;
2892
2893 delete conv;
2894 }
2895 }
2896 #endif // __DARWIN__
2897
2898 // step (2)
2899 wxFontEncoding enc = m_encoding;
2900 #if wxUSE_FONTMAP
2901 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2902 {
2903 // use "false" to suppress interactive dialogs -- we can be called from
2904 // anywhere and popping up a dialog from here is the last thing we want to
2905 // do
2906 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2907 }
2908 #endif // wxUSE_FONTMAP
2909
2910 switch ( enc )
2911 {
2912 case wxFONTENCODING_UTF7:
2913 return new wxMBConvUTF7;
2914
2915 case wxFONTENCODING_UTF8:
2916 return new wxMBConvUTF8;
2917
2918 case wxFONTENCODING_UTF16BE:
2919 return new wxMBConvUTF16BE;
2920
2921 case wxFONTENCODING_UTF16LE:
2922 return new wxMBConvUTF16LE;
2923
2924 case wxFONTENCODING_UTF32BE:
2925 return new wxMBConvUTF32BE;
2926
2927 case wxFONTENCODING_UTF32LE:
2928 return new wxMBConvUTF32LE;
2929
2930 default:
2931 // nothing to do but put here to suppress gcc warnings
2932 break;
2933 }
2934
2935 // step (3)
2936 #if wxUSE_FONTMAP
2937 {
2938 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2939 : new wxMBConv_wxwin(m_encoding);
2940 if ( conv->IsOk() )
2941 return conv;
2942
2943 delete conv;
2944 }
2945 #endif // wxUSE_FONTMAP
2946
2947 // NB: This is a hack to prevent deadlock. What could otherwise happen
2948 // in Unicode build: wxConvLocal creation ends up being here
2949 // because of some failure and logs the error. But wxLog will try to
2950 // attach a timestamp, for which it will need wxConvLocal (to convert
2951 // time to char* and then wchar_t*), but that fails, tries to log the
2952 // error, but wxLog has an (already locked) critical section that
2953 // guards the static buffer.
2954 static bool alreadyLoggingError = false;
2955 if (!alreadyLoggingError)
2956 {
2957 alreadyLoggingError = true;
2958 wxLogError(_("Cannot convert from the charset '%s'!"),
2959 m_name ? m_name
2960 :
2961 #if wxUSE_FONTMAP
2962 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2963 #else // !wxUSE_FONTMAP
2964 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2965 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2966 );
2967
2968 alreadyLoggingError = false;
2969 }
2970
2971 return NULL;
2972 }
2973
2974 void wxCSConv::CreateConvIfNeeded() const
2975 {
2976 if ( m_deferred )
2977 {
2978 wxCSConv *self = (wxCSConv *)this; // const_cast
2979
2980 // if we don't have neither the name nor the encoding, use the default
2981 // encoding for this system
2982 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2983 {
2984 #if wxUSE_INTL
2985 self->m_encoding = wxLocale::GetSystemEncoding();
2986 #else
2987 // fallback to some reasonable default:
2988 self->m_encoding = wxFONTENCODING_ISO8859_1;
2989 #endif // wxUSE_INTL
2990 }
2991
2992 self->m_convReal = DoCreate();
2993 self->m_deferred = false;
2994 }
2995 }
2996
2997 bool wxCSConv::IsOk() const
2998 {
2999 CreateConvIfNeeded();
3000
3001 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3002 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3003 return true; // always ok as we do it ourselves
3004
3005 // m_convReal->IsOk() is called at its own creation, so we know it must
3006 // be ok if m_convReal is non-NULL
3007 return m_convReal != NULL;
3008 }
3009
3010 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3011 const char *src, size_t srcLen) const
3012 {
3013 CreateConvIfNeeded();
3014
3015 if (m_convReal)
3016 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3017
3018 // latin-1 (direct)
3019 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3020 }
3021
3022 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3023 const wchar_t *src, size_t srcLen) const
3024 {
3025 CreateConvIfNeeded();
3026
3027 if (m_convReal)
3028 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3029
3030 // latin-1 (direct)
3031 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3032 }
3033
3034 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3035 {
3036 CreateConvIfNeeded();
3037
3038 if (m_convReal)
3039 return m_convReal->MB2WC(buf, psz, n);
3040
3041 // latin-1 (direct)
3042 size_t len = strlen(psz);
3043
3044 if (buf)
3045 {
3046 for (size_t c = 0; c <= len; c++)
3047 buf[c] = (unsigned char)(psz[c]);
3048 }
3049
3050 return len;
3051 }
3052
3053 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3054 {
3055 CreateConvIfNeeded();
3056
3057 if (m_convReal)
3058 return m_convReal->WC2MB(buf, psz, n);
3059
3060 // latin-1 (direct)
3061 const size_t len = wxWcslen(psz);
3062 if (buf)
3063 {
3064 for (size_t c = 0; c <= len; c++)
3065 {
3066 if (psz[c] > 0xFF)
3067 return wxCONV_FAILED;
3068
3069 buf[c] = (char)psz[c];
3070 }
3071 }
3072 else
3073 {
3074 for (size_t c = 0; c <= len; c++)
3075 {
3076 if (psz[c] > 0xFF)
3077 return wxCONV_FAILED;
3078 }
3079 }
3080
3081 return len;
3082 }
3083
3084 size_t wxCSConv::GetMBNulLen() const
3085 {
3086 CreateConvIfNeeded();
3087
3088 if ( m_convReal )
3089 {
3090 return m_convReal->GetMBNulLen();
3091 }
3092
3093 // otherwise, we are ISO-8859-1
3094 return 1;
3095 }
3096
3097 #if wxUSE_UNICODE_UTF8
3098 bool wxCSConv::IsUTF8() const
3099 {
3100 CreateConvIfNeeded();
3101
3102 if ( m_convReal )
3103 {
3104 return m_convReal->IsUTF8();
3105 }
3106
3107 // otherwise, we are ISO-8859-1
3108 return false;
3109 }
3110 #endif
3111
3112
3113 #if wxUSE_UNICODE
3114
3115 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3116 {
3117 if ( !s )
3118 return wxWCharBuffer();
3119
3120 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3121 if ( !wbuf )
3122 wbuf = wxMBConvUTF8().cMB2WX(s);
3123 if ( !wbuf )
3124 wbuf = wxConvISO8859_1.cMB2WX(s);
3125
3126 return wbuf;
3127 }
3128
3129 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3130 {
3131 if ( !ws )
3132 return wxCharBuffer();
3133
3134 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3135 if ( !buf )
3136 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3137
3138 return buf;
3139 }
3140
3141 #endif // wxUSE_UNICODE
3142
3143 // ----------------------------------------------------------------------------
3144 // globals
3145 // ----------------------------------------------------------------------------
3146
// NB: The reason why we create the converter objects in this convoluted
//     way, using a factory function instead of a global variable, is that
//     they may be used at static initialization time (some of them are
//     used by wxString ctors and there may be a global wxString object),
//     in other words possibly _before_ the global converter object itself
//     would be initialized.
3153
3154 #undef wxConvLibc
3155 #undef wxConvUTF8
3156 #undef wxConvUTF7
3157 #undef wxConvLocal
3158 #undef wxConvISO8859_1
3159
// Define everything needed for the global converter "name":
//  - the exported pointer name##Ptr of type klass*, initially NULL
//  - the exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of class impl_klass which is constructed
//    with ctor_args
//  - a file-static variable initialized by calling this accessor
//
// The macro must be followed by a semicolon where it is used.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)          \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                         \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                             \
    {                                                                       \
        static impl_klass name##Obj ctor_args;                              \
        return &name##Obj;                                                  \
    }                                                                       \
    /* calling the accessor here ensures that all the global converter */  \
    /* objects exist by the time static initialization is done, i.e.   */  \
    /* before any thread is launched:                                  */  \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the class of the object is the
// same as the class used for the pointer.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args)                       \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3174
3175 #ifdef __WINDOWS__
3176 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3177 #else
3178 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3179 #endif
3180
3181 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3182 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3183
3184 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3185 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3186
3187 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3188 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3189
3190 #ifdef __DARWIN__
3191 // The xnu kernel always communicates file paths in decomposed UTF-8.
3192 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3193 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3194 #endif
3195
3196 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3197 #ifdef __DARWIN__
3198 &wxConvMacUTF8DObj;
3199 #else // !__DARWIN__
3200 wxGet_wxConvLibcPtr();
3201 #endif // __DARWIN__/!__DARWIN__
3202
3203 #else // !wxUSE_WCHAR_T
3204
3205 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3206 // stand-ins in absence of wchar_t
3207 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3208 wxConvISO8859_1,
3209 wxConvLocal,
3210 wxConvUTF8;
3211
3212 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T