implement wxMBConv_iconv::To/FromWChar() instead of MB2WC/WC2MB: this allows to use...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __DARWIN__
56 #include "wx/mac/corefoundation/private/strconv_cf.h"
57 #endif //def __DARWIN__
58
59
60 #define TRACE_STRCONV _T("strconv")
61
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63 // be 4 bytes
64 #if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66 #endif
67
68
69 // ============================================================================
70 // implementation
71 // ============================================================================
72
// Helper of cMB2WC(): returns true if at least one of the n bytes starting at
// p is not NUL and false if all of them are NUL (in particular if n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
85
86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87 {
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109 }
110
111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112 {
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128 }
129
130 #ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132 #else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134 #endif // WC_UTF16/!WC_UTF16
135
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
138 //
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
140 // check for this
141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142 {
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152 }
153
154 // ----------------------------------------------------------------------------
155 // wxMBConv
156 // ----------------------------------------------------------------------------
157
// Default implementation of ToWChar() in terms of the legacy MB2WC().
//
// Converts the srcLen bytes at src (or the NUL-terminated string at src if
// srcLen is wxNO_LEN) chunk by chunk, the chunks being separated by the NUL
// terminator of this encoding. If dst is NULL, nothing is written and only the
// needed length is computed.
//
// Returns the number of wide characters written (or needed), including the
// L'\0' ending each chunk, or wxCONV_FAILED if GetMBNulLen() or MB2WC() fails
// or if more than dstLen wide characters would have to be written to dst.
158 size_t
159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
161 {
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
168
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten = 0;
171
172 // the number of NULs terminating this string
173 size_t nulLen = 0; // not really needed, but just to avoid warnings
174
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
178 // NULs at the end
179 wxCharBuffer bufTmp;
180 const char *srcEnd;
181 if ( srcLen != wxNO_LEN )
182 {
183 // we need to know how to find the end of this string
184 nulLen = GetMBNulLen();
185 if ( nulLen == wxCONV_FAILED )
186 return wxCONV_FAILED;
187
188 // if there are enough NULs we can avoid the copy
189 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
190 {
191 // make a copy in order to properly NUL-terminate the string
192 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
193 char * const p = bufTmp.data();
194 memcpy(p, src, srcLen);
195 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
196 *s = '\0';
197
198 src = bufTmp;
199 }
200
201 srcEnd = src + srcLen;
202 }
203 else // quit after the first loop iteration
204 {
205 srcEnd = NULL;
206 }
207
208 for ( ;; )
209 {
210 // try to convert the current chunk
211 size_t lenChunk = MB2WC(NULL, src, 0);
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
214
215 lenChunk++; // for the L'\0' at the end of this chunk
216
217 dstWritten += lenChunk;
218
// NOTE(review): for an empty chunk we leave the loop before anything is
// written to dst, so the L'\0' counted just above is not actually stored
// there -- confirm that no caller relies on it being written
219 if ( lenChunk == 1 )
220 {
221 // nothing left in the input string, conversion succeeded
222 break;
223 }
224
225 if ( dst )
226 {
227 if ( dstWritten > dstLen )
228 return wxCONV_FAILED;
229
230 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
231 return wxCONV_FAILED;
232
233 dst += lenChunk;
234 }
235
236 if ( !srcEnd )
237 {
238 // we convert just one chunk in this case as this is the entire
239 // string anyhow
240 break;
241 }
242
243 // advance the input pointer past the end of this chunk
244 while ( NotAllNULs(src, nulLen) )
245 {
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
250 src += nulLen;
251 }
252
253 src += nulLen; // skipping over its terminator as well
254
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by srcEnd
258 if ( src >= srcEnd )
259 break;
260 }
261
262 return dstWritten;
263 }
264
265 size_t
266 wxMBConv::FromWChar(char *dst, size_t dstLen,
267 const wchar_t *src, size_t srcLen) const
268 {
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten = 0;
271
272 // make a copy of the input string unless it is already properly
273 // NUL-terminated
274 //
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp;
278 if ( srcLen == wxNO_LEN )
279 {
280 srcLen = wxWcslen(src) + 1;
281 }
282 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
283 {
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp = wxWCharBuffer(srcLen);
286 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
287 src = bufTmp;
288 }
289
290 const size_t lenNul = GetMBNulLen();
291 for ( const wchar_t * const srcEnd = src + srcLen;
292 src < srcEnd;
293 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
294 {
295 // try to convert the current chunk
296 size_t lenChunk = WC2MB(NULL, src, 0);
297
298 if ( lenChunk == wxCONV_FAILED )
299 return wxCONV_FAILED;
300
301 lenChunk += lenNul;
302 dstWritten += lenChunk;
303
304 if ( dst )
305 {
306 if ( dstWritten > dstLen )
307 return wxCONV_FAILED;
308
309 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
310 return wxCONV_FAILED;
311
312 dst += lenChunk;
313 }
314 }
315
316 return dstWritten;
317 }
318
319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
320 {
321 size_t rc = ToWChar(outBuff, outLen, inBuff);
322 if ( rc != wxCONV_FAILED )
323 {
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
326 rc--;
327 }
328
329 return rc;
330 }
331
332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
333 {
334 size_t rc = FromWChar(outBuff, outLen, inBuff);
335 if ( rc != wxCONV_FAILED )
336 {
337 rc -= GetMBNulLen();
338 }
339
340 return rc;
341 }
342
// Destructor: there is nothing to release in this class itself.
343 wxMBConv::~wxMBConv()
344 {
345 // intentionally empty (the out of line definition is probably needed for linking under Darwin)
346 }
347
348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
349 {
350 if ( psz )
351 {
352 // calculate the length of the buffer needed first
353 const size_t nLen = ToWChar(NULL, 0, psz);
354 if ( nLen != wxCONV_FAILED )
355 {
356 // now do the actual conversion
357 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
358
359 // +1 for the trailing NULL
360 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
361 return buf;
362 }
363 }
364
365 return wxWCharBuffer();
366 }
367
368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
369 {
370 if ( pwz )
371 {
372 const size_t nLen = FromWChar(NULL, 0, pwz);
373 if ( nLen != wxCONV_FAILED )
374 {
375 wxCharBuffer buf(nLen - 1);
376 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
377 return buf;
378 }
379 }
380
381 return wxCharBuffer();
382 }
383
384 const wxWCharBuffer
385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
386 {
387 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
388 if ( dstLen != wxCONV_FAILED )
389 {
390 // notice that we allocate space for dstLen+1 wide characters here
391 // because we want the buffer to always be NUL-terminated, even if the
392 // input isn't (as otherwise the caller has no way to know its length)
393 wxWCharBuffer wbuf(dstLen);
394 wbuf.data()[dstLen - 1] = L'\0';
395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
396 {
397 if ( outLen )
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412 }
413
414 const wxCharBuffer
415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
416 {
417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
418 if ( dstLen != wxCONV_FAILED )
419 {
420 const size_t nulLen = GetMBNulLen();
421
422 // as above, ensure that the buffer is always NUL-terminated, even if
423 // the input is not
424 wxCharBuffer buf(dstLen + nulLen - 1);
425 memset(buf.data() + dstLen, 0, nulLen);
426 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
427 {
428 if ( outLen )
429 {
430 *outLen = dstLen;
431
432 if ( dstLen >= nulLen &&
433 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
434 {
435 // in this case the output is NUL-terminated and we're not
436 // supposed to count NUL
437 *outLen -= nulLen;
438 }
439 }
440
441 return buf;
442 }
443 }
444
445 if ( outLen )
446 *outLen = 0;
447
448 return wxCharBuffer();
449 }
450
451 // ----------------------------------------------------------------------------
452 // wxMBConvLibc
453 // ----------------------------------------------------------------------------
454
// Multibyte to wide char conversion: simply forwards all its arguments to
// wxMB2WC() and returns its result unchanged.
455 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
456 {
457 return wxMB2WC(buf, psz, n);
458 }
459
// Wide char to multibyte conversion: simply forwards all its arguments to
// wxWC2MB() and returns its result unchanged.
460 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
461 {
462 return wxWC2MB(buf, psz, n);
463 }
464
465 // ----------------------------------------------------------------------------
466 // wxConvBrokenFileNames
467 // ----------------------------------------------------------------------------
468
469 #ifdef __UNIX__
470
471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
472 {
473 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
474 wxStricmp(charset, _T("UTF8")) == 0 )
475 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
476 else
477 m_conv = new wxCSConv(charset);
478 }
479
480 #endif // __UNIX__
481
482 // ----------------------------------------------------------------------------
483 // UTF-7
484 // ----------------------------------------------------------------------------
485
486 // Implementation (C) 2004 Fredrik Roubert
487
488 //
489 // BASE64 decoding table: indexed by the (unsigned) byte value, gives the
// 6 bit value of the BASE64 digits 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/' and
// 0xff for all the other bytes, which are not BASE64 digits
490 //
491 static const unsigned char utf7unb64[] =
492 {
493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
499 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
500 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
502 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
503 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
504 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
505 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
506 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
507 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
508 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
525 };
526
// Decodes the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL, decoding stops once n wide characters have been
// produced and a terminating NUL is appended only if there is still room for
// it; if buf is NULL only the length is computed.
//
// Returns the number of wide characters produced, not counting the trailing
// NUL, or wxCONV_FAILED if a '+' not immediately followed by '-' is not
// followed by enough BASE64 digits to decode at least one byte.
527 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
528 {
529 size_t len = 0;
530
531 while ( *psz && (!buf || (len < n)) )
532 {
533 unsigned char cc = *psz++;
534 if (cc != '+')
535 {
536 // plain ASCII char
537 if (buf)
538 *buf++ = cc;
539 len++;
540 }
541 else if (*psz == '-')
542 {
543 // encoded plus sign
544 if (buf)
545 *buf++ = cc;
546 len++;
547 psz++;
548 }
549 else // start of BASE64 encoded string
550 {
// d accumulates the decoded bits and l is the number of bits in d
// not consumed yet; lsb tells whether the next decoded byte is the
// low (true) or the high (false) byte of the current 16 bit unit
// and ok becomes true as soon as at least one byte was decoded
551 bool lsb, ok;
552 unsigned int d, l;
553 for ( ok = lsb = false, d = 0, l = 0;
554 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
555 psz++ )
556 {
557 d <<= 6;
558 d += cc;
559 for (l += 6; l >= 8; lsb = !lsb)
560 {
561 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
562 if (lsb)
563 {
// low byte: complete the unit started below and move on
564 if (buf)
565 *buf++ |= c;
566 len ++;
567 }
568 else
569 {
// high byte: start a new unit, it is counted once completed
570 if (buf)
571 *buf = (wchar_t)(c << 8);
572 }
573
574 ok = true;
575 }
576 }
577
578 if ( !ok )
579 {
580 // in valid UTF7 we should have valid characters after '+'
581 return wxCONV_FAILED;
582 }
583
// the '-' ending the BASE64 run is optional and is not part of the output
584 if (*psz == '-')
585 psz++;
586 }
587 }
588
589 if ( buf && (len < n) )
590 *buf = '\0';
591
592 return len;
593 }
594
595 //
596 // BASE64 encoding table: maps a 6 bit value (0..63) to its BASE64 digit
597 //
598 static const unsigned char utf7enb64[] =
599 {
600 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
601 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
602 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
603 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
604 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
605 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
606 'w', 'x', 'y', 'z', '0', '1', '2', '3',
607 '4', '5', '6', '7', '8', '9', '+', '/'
608 };
609
610 //
611 // UTF-7 encoding table: gives the class of each of the 128 ASCII characters
612 //
613 // 0 - Set D (directly encoded characters)
614 // 1 - Set O (optional direct characters)
615 // 2 - whitespace characters (optional)
616 // 3 - special characters
617 //
// (wxMBConvUTF7::WC2MB() only writes the characters of class 0 directly)
618 static const unsigned char utf7encode[128] =
619 {
620 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
621 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
622 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
624 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
625 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
626 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
627 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
628 };
629
// Encodes the NUL-terminated wide string psz as UTF-7.
//
// Characters of class 0 in utf7encode are written as is, '+' is written as
// "+-" and any run of other characters as '+', the BASE64 encoding of their
// 16 bit values (high byte first) and '-'.
//
// If buf is not NULL, encoding stops once at least n bytes have been produced
// and a terminating NUL is appended only if there is still room for it; if buf
// is NULL only the length is computed.
//
// Returns the number of bytes produced, not counting the trailing NUL, or
// wxCONV_FAILED (only if WC_UTF16 is not defined) for a character above
// 0xffff.
630 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
631 {
632 size_t len = 0;
633
634 while (*psz && ((!buf) || (len < n)))
635 {
636 wchar_t cc = *psz++;
// NOTE(review): if wchar_t is a signed type, a negative cc passes the
// "cc < 0x80" test and indexes utf7encode out of range -- confirm
637 if (cc < 0x80 && utf7encode[cc] < 1)
638 {
639 // plain ASCII char
640 if (buf)
641 *buf++ = (char)cc;
642
643 len++;
644 }
645 #ifndef WC_UTF16
646 else if (((wxUint32)cc) > 0xffff)
647 {
648 // no surrogate pair generation (yet?)
649 return wxCONV_FAILED;
650 }
651 #endif
652 else
653 {
654 if (buf)
655 *buf++ = '+';
656
657 len++;
658 if (cc != '+')
659 {
660 // BASE64 encode string
// d accumulates the bits to encode and l is the number of bits
// in d not written out yet
661 unsigned int lsb, d, l;
662 for (d = 0, l = 0; /*nothing*/; psz++)
663 {
// feed the high byte of cc first, then its low byte
664 for (lsb = 0; lsb < 2; lsb ++)
665 {
666 d <<= 8;
667 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
668
669 for (l += 8; l >= 6; )
670 {
671 l -= 6;
672 if (buf)
673 *buf++ = utf7enb64[(d >> l) % 64];
674 len++;
675 }
676 }
677
// peek at the next character: the run ends at the end of the
// string or before a character which can be written directly
// (it is left for the outer loop to process)
678 cc = *psz;
679 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
680 break;
681 }
682
// flush the remaining l bits, padded with zeroes on the right
683 if (l != 0)
684 {
685 if (buf)
686 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
687
688 len++;
689 }
690 }
691
692 if (buf)
693 *buf++ = '-';
694 len++;
695 }
696 }
697
698 if (buf && (len < n))
699 *buf = 0;
700
701 return len;
702 }
703
704 // ----------------------------------------------------------------------------
705 // UTF-8
706 // ----------------------------------------------------------------------------
707
// utf8_max[i] is the upper bound of the values checked against an (i+1)-byte
// UTF-8 sequence by the wxMBConvUTF8 code: the encoder picks the first index
// whose entry is not exceeded by the character and the last entry, 0xffffffff,
// ensures that this search always stops.
708 static const wxUint32 utf8_max[]=
709 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
710
711 // boundaries of the private use area we use to (temporarily) remap the bytes
712 // which are invalid in a UTF-8 encoded string: one code point per byte value,
// wxUnicodePUAEnd being one past the last of them
713 const wxUint32 wxUnicodePUA = 0x100000;
714 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
715
716 // this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte and contains 0 for the bytes
// which can't start a valid sequence
717 const unsigned char tableUtf8Lengths[256] = {
718 // single-byte sequences (ASCII):
719 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
720 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
721 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
722 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
723 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
724 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
725 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
726 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
727
728 // these are invalid:
729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
731 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
732 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
733 0, 0, // C0,C1
734
735 // two-byte sequences:
736 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
737 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
738
739 // three-byte sequences:
740 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
741
742 // four-byte sequences:
743 4, 4, 4, 4, 4, // F0..F4
744
745 // these are invalid again (5- or 6-byte
746 // sequences and sequences for code points
747 // above U+10FFFF, as restricted by RFC 3629):
748 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
749 };
750
// Decodes the UTF-8 string src of srcLen bytes (or NUL-terminated if srcLen
// is wxNO_LEN, in which case the trailing NUL is decoded and counted too).
//
// If dstLen is 0 nothing is written and only the needed length is computed,
// otherwise the result is stored in dst.
//
// Returns the number of wide characters written (or needed), or wxCONV_FAILED
// if the input contains a byte which can't start a sequence, a truncated
// sequence or an invalid continuation byte, or if dst is too small.
751 size_t
752 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
753 const char *src, size_t srcLen) const
754 {
755 wchar_t *out = dstLen ? dst : NULL;
756 size_t written = 0;
757
// NOTE(review): after this srcLen seems to never be equal to wxNO_LEN any
// more, so the checks for it below look redundant -- confirm
758 if ( srcLen == wxNO_LEN )
759 srcLen = strlen(src) + 1;
760
761 for ( const char *p = src; ; p++ )
762 {
763 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
764 {
765 // all done successfully, just add the trailing NULL if we are not
766 // using explicit length
767 if ( srcLen == wxNO_LEN )
768 {
769 if ( out )
770 {
771 if ( !dstLen )
772 break;
773
774 *out = L'\0';
775 }
776
777 written++;
778 }
779
780 return written;
781 }
782
// fail if there is no more room in the output buffer
783 if ( out && !dstLen-- )
784 break;
785
786 wxUint32 code;
787 unsigned char c = *p;
788
789 if ( c < 0x80 )
790 {
791 if ( srcLen == 0 ) // the test works for wxNO_LEN too
792 break;
793
794 if ( srcLen != wxNO_LEN )
795 srcLen--;
796
797 code = c;
798 }
799 else
800 {
801 unsigned len = tableUtf8Lengths[c];
802 if ( !len )
803 break;
804
805 if ( srcLen < len ) // the test works for wxNO_LEN too
806 break;
807
808 if ( srcLen != wxNO_LEN )
809 srcLen -= len;
810
811 // Char. number range | UTF-8 octet sequence
812 // (hexadecimal) | (binary)
813 // ----------------------+----------------------------------------
814 // 0000 0000 - 0000 007F | 0xxxxxxx
815 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
816 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
817 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
818 //
819 // Code point value is stored in bits marked with 'x',
820 // lowest-order bit of the value on the right side in the diagram
821 // above. (from RFC 3629)
822
823 // mask to extract lead byte's value ('x' bits above), by sequence
824 // length:
825 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
826
827 // mask and value of lead byte's most significant bits, by length:
828 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
829 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
830
831 len--; // it's more convenient to work with 0-based length here
832
833 // extract the lead byte's value bits:
834 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
835 break;
836
837 code = c & leadValueMask[len];
838
839 // all remaining bytes, if any, are handled in the same way
840 // regardless of sequence's length:
841 for ( ; len; --len )
842 {
843 c = *++p;
844 if ( (c & 0xC0) != 0x80 )
845 return wxCONV_FAILED;
846
847 code <<= 6;
848 code |= c & 0x3F;
849 }
850 }
851
852 #ifdef WC_UTF16
853 // cast is ok because wchar_t == wxUint16 if WC_UTF16
// NOTE(review): dstLen was decremented only once above even when two
// units are stored here -- confirm that dst can't be overflowed
854 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
855 {
856 if ( out )
857 out++;
858 written++;
859 }
860 #else // !WC_UTF16
861 if ( out )
862 *out = code;
863 #endif // WC_UTF16/!WC_UTF16
864
865 if ( out )
866 out++;
867
868 written++;
869 }
870
// we only get here if one of the checks in the loop above failed
871 return wxCONV_FAILED;
872 }
873
874 size_t
875 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
876 const wchar_t *src, size_t srcLen) const
877 {
878 char *out = dstLen ? dst : NULL;
879 size_t written = 0;
880
881 for ( const wchar_t *wp = src; ; wp++ )
882 {
883 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
884 {
885 // all done successfully, just add the trailing NULL if we are not
886 // using explicit length
887 if ( srcLen == wxNO_LEN )
888 {
889 if ( out )
890 {
891 if ( !dstLen )
892 break;
893
894 *out = '\0';
895 }
896
897 written++;
898 }
899
900 return written;
901 }
902
903
904 wxUint32 code;
905 #ifdef WC_UTF16
906 // cast is ok for WC_UTF16
907 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
908 {
909 // skip the next char too as we decoded a surrogate
910 wp++;
911 }
912 #else // wchar_t is UTF-32
913 code = *wp & 0x7fffffff;
914 #endif
915
916 unsigned len;
917 if ( code <= 0x7F )
918 {
919 len = 1;
920 if ( out )
921 {
922 if ( dstLen < len )
923 break;
924
925 out[0] = (char)code;
926 }
927 }
928 else if ( code <= 0x07FF )
929 {
930 len = 2;
931 if ( out )
932 {
933 if ( dstLen < len )
934 break;
935
936 // NB: this line takes 6 least significant bits, encodes them as
937 // 10xxxxxx and discards them so that the next byte can be encoded:
938 out[1] = 0x80 | (code & 0x3F); code >>= 6;
939 out[0] = 0xC0 | code;
940 }
941 }
942 else if ( code < 0xFFFF )
943 {
944 len = 3;
945 if ( out )
946 {
947 if ( dstLen < len )
948 break;
949
950 out[2] = 0x80 | (code & 0x3F); code >>= 6;
951 out[1] = 0x80 | (code & 0x3F); code >>= 6;
952 out[0] = 0xE0 | code;
953 }
954 }
955 else if ( code <= 0x10FFFF )
956 {
957 len = 4;
958 if ( out )
959 {
960 if ( dstLen < len )
961 break;
962
963 out[3] = 0x80 | (code & 0x3F); code >>= 6;
964 out[2] = 0x80 | (code & 0x3F); code >>= 6;
965 out[1] = 0x80 | (code & 0x3F); code >>= 6;
966 out[0] = 0xF0 | code;
967 }
968 }
969 else
970 {
971 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
972 break;
973 }
974
975 if ( out )
976 {
977 out += len;
978 dstLen -= len;
979 }
980
981 written += len;
982 }
983
984 // we only get here if an error occurs during decoding
985 return wxCONV_FAILED;
986 }
987
// Decodes the UTF-8 string psz, handling invalid sequences as specified by
// m_options: with MAP_INVALID_UTF8_NOT the strict base class version is used,
// with MAP_INVALID_UTF8_TO_PUA each byte of an invalid sequence is mapped to
// wxUnicodePUA + byte and with MAP_INVALID_UTF8_TO_OCTAL it is replaced with a
// backslash followed by 3 octal digits (backslashes in the input being doubled).
//
// If buf is not NULL the result is stored there, n being its size in wide
// characters; otherwise only the length is computed.
//
// Returns the number of wide characters produced plus 1, or wxCONV_FAILED.
//
// NOTE(review): srcLen is decremented once per loop iteration and not once per
// input byte consumed, and decrementing it at 0 wraps it around -- confirm
// how the input with explicit length is meant to be handled here
988 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
989 const char *psz, size_t srcLen) const
990 {
991 if ( m_options == MAP_INVALID_UTF8_NOT )
992 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
993
994 size_t len = 0;
995
996 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
997 {
998 const char *opsz = psz;
999 bool invalid = false;
1000 unsigned char cc = *psz++, fc = cc;
// count the leading 1 bits of the first byte
1001 unsigned cnt;
1002 for (cnt = 0; fc & 0x80; cnt++)
1003 fc <<= 1;
1004
1005 if (!cnt)
1006 {
1007 // plain ASCII char
1008 if (buf)
1009 *buf++ = cc;
1010 len++;
1011
1012 // escape the escape character for octal escapes
1013 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1014 && cc == '\\' && (!buf || len < n))
1015 {
1016 if (buf)
1017 *buf++ = cc;
1018 len++;
1019 }
1020 }
1021 else
1022 {
// from now on cnt is the number of continuation bytes expected
1023 cnt--;
1024 if (!cnt)
1025 {
1026 // invalid UTF-8 sequence
1027 invalid = true;
1028 }
1029 else
1030 {
1031 unsigned ocnt = cnt - 1;
1032 wxUint32 res = cc & (0x3f >> cnt);
1033 while (cnt--)
1034 {
1035 cc = *psz;
1036 if ((cc & 0xC0) != 0x80)
1037 {
1038 // invalid UTF-8 sequence
1039 invalid = true;
1040 break;
1041 }
1042
1043 psz++;
1044 res = (res << 6) | (cc & 0x3f);
1045 }
1046
// a value which would have fit in a shorter sequence is rejected
1047 if (invalid || res <= utf8_max[ocnt])
1048 {
1049 // illegal UTF-8 encoding
1050 invalid = true;
1051 }
1052 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1053 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1054 {
1055 // if one of our PUA characters turns up externally
1056 // it must also be treated as an illegal sequence
1057 // (a bit like you have to escape an escape character)
1058 invalid = true;
1059 }
1060 else
1061 {
1062 #ifdef WC_UTF16
1063 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1064 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1065 if (pa == wxCONV_FAILED)
1066 {
1067 invalid = true;
1068 }
1069 else
1070 {
1071 if (buf)
1072 buf += pa;
1073 len += pa;
1074 }
1075 #else // !WC_UTF16
1076 if (buf)
1077 *buf++ = (wchar_t)res;
1078 len++;
1079 #endif // WC_UTF16/!WC_UTF16
1080 }
1081 }
1082
1083 if (invalid)
1084 {
// map all the bytes of the invalid sequence, from opsz up to psz
1085 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1086 {
1087 while (opsz < psz && (!buf || len < n))
1088 {
1089 #ifdef WC_UTF16
1090 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1091 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1092 wxASSERT(pa != wxCONV_FAILED);
1093 if (buf)
1094 buf += pa;
1095 opsz++;
1096 len += pa;
1097 #else
1098 if (buf)
1099 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1100 opsz++;
1101 len++;
1102 #endif
1103 }
1104 }
1105 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1106 {
1107 while (opsz < psz && (!buf || len < n))
1108 {
// the 4 characters are counted even if there is no room to store them
1109 if ( buf && len + 3 < n )
1110 {
1111 unsigned char on = *opsz;
1112 *buf++ = L'\\';
1113 *buf++ = (wchar_t)( L'0' + on / 0100 );
1114 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1115 *buf++ = (wchar_t)( L'0' + on % 010 );
1116 }
1117
1118 opsz++;
1119 len += 4;
1120 }
1121 }
1122 else // MAP_INVALID_UTF8_NOT
1123 {
1124 return wxCONV_FAILED;
1125 }
1126 }
1127 }
1128 }
1129
1130 if (srcLen == wxNO_LEN && buf && (len < n))
1131 *buf = 0;
1132
1133 return len + 1;
1134 }
1135
// returns true if wch is one of the octal digits L'0'..L'7'
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch < L'8';
}
1140
// Encodes the wide string psz as UTF-8, undoing the mapping of invalid bytes
// done by ToWChar() according to m_options: with MAP_INVALID_UTF8_NOT the
// strict base class version is used, with MAP_INVALID_UTF8_TO_PUA the
// characters in the wxUnicodePUA..wxUnicodePUAEnd range are written as single
// bytes and with MAP_INVALID_UTF8_TO_OCTAL a double backslash becomes a single
// one and a backslash followed by 3 octal digits becomes the corresponding byte.
//
// If buf is not NULL the result is stored there, n being its size in bytes;
// otherwise only the length is computed.
//
// Returns the number of bytes produced plus 1.
//
// NOTE(review): srcLen is decremented once per loop iteration even when more
// than one wide character is consumed (surrogates, escapes), and decrementing
// it at 0 wraps it around -- confirm how the input with explicit length is
// meant to be handled here
1141 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1142 const wchar_t *psz, size_t srcLen) const
1143 {
1144 if ( m_options == MAP_INVALID_UTF8_NOT )
1145 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1146
1147 size_t len = 0;
1148
1149 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1150 {
1151 wxUint32 cc;
1152
1153 #ifdef WC_UTF16
1154 // cast is ok for WC_UTF16
// an unpaired surrogate is not an error here: it is just skipped over
// as a single unit
1155 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1156 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1157 #else
1158 cc = (*psz++) & 0x7fffffff;
1159 #endif
1160
1161 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1162 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1163 {
// restore the original byte mapped to the PUA
1164 if (buf)
1165 *buf++ = (char)(cc - wxUnicodePUA);
1166 len++;
1167 }
1168 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1169 && cc == L'\\' && psz[0] == L'\\' )
1170 {
// escaped backslash: write it once
1171 if (buf)
1172 *buf++ = (char)cc;
1173 psz++;
1174 len++;
1175 }
1176 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1177 cc == L'\\' &&
1178 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1179 {
// octal escape: write the byte it stands for
1180 if (buf)
1181 {
1182 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1183 (psz[1] - L'0') * 010 +
1184 (psz[2] - L'0'));
1185 }
1186
1187 psz += 3;
1188 len++;
1189 }
1190 else
1191 {
// find the number of continuation bytes needed for this character
1192 unsigned cnt;
1193 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1194 {
1195 }
1196
1197 if (!cnt)
1198 {
1199 // plain ASCII char
1200 if (buf)
1201 *buf++ = (char) cc;
1202 len++;
1203 }
1204 else
1205 {
1206 len += cnt + 1;
1207 if (buf)
1208 {
// the lead byte first, then the continuation bytes carrying 6
// bits each, most significant ones first
// NOTE(review): (-128 >> cnt) shifts a negative value to
// the right -- confirm that this is an arithmetic shift
// with all the compilers used
1209 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1210 while (cnt--)
1211 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1212 }
1213 }
1214 }
1215 }
1216
1217 if (srcLen == wxNO_LEN && buf && (len < n))
1218 *buf = 0;
1219
1220 return len + 1;
1221 }
1222
1223 // ============================================================================
1224 // UTF-16
1225 // ============================================================================
1226
// wxMBConvUTF16straight is the name used below for the conversion which
// copies the 16 bit units unchanged and wxMBConvUTF16swap for the one which
// swaps their bytes: which of wxMBConvUTF16BE and wxMBConvUTF16LE each of them
// stands for depends on whether WORDS_BIGENDIAN is defined
1227 #ifdef WORDS_BIGENDIAN
1228 #define wxMBConvUTF16straight wxMBConvUTF16BE
1229 #define wxMBConvUTF16swap wxMBConvUTF16LE
1230 #else
1231 #define wxMBConvUTF16swap wxMBConvUTF16BE
1232 #define wxMBConvUTF16straight wxMBConvUTF16LE
1233 #endif
1234
1235 /* static */
1236 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1237 {
1238 if ( srcLen == wxNO_LEN )
1239 {
1240 // count the number of bytes in input, including the trailing NULs
1241 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1242 for ( srcLen = 1; *inBuff++; srcLen++ )
1243 ;
1244
1245 srcLen *= BYTES_PER_CHAR;
1246 }
1247 else // we already have the length
1248 {
1249 // we can only convert an entire number of UTF-16 characters
1250 if ( srcLen % BYTES_PER_CHAR )
1251 return wxCONV_FAILED;
1252 }
1253
1254 return srcLen;
1255 }
1256
1257 // case when in-memory representation is UTF-16 too
1258 #ifdef WC_UTF16
1259
1260 // ----------------------------------------------------------------------------
1261 // conversions without endianness change
1262 // ----------------------------------------------------------------------------
1263
1264 size_t
1265 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1266 const char *src, size_t srcLen) const
1267 {
1268 // set up the scene for using memcpy() (which is presumably more efficient
1269 // than copying the bytes one by one)
1270 srcLen = GetLength(src, srcLen);
1271 if ( srcLen == wxNO_LEN )
1272 return wxCONV_FAILED;
1273
1274 const size_t inLen = srcLen / BYTES_PER_CHAR;
1275 if ( dst )
1276 {
1277 if ( dstLen < inLen )
1278 return wxCONV_FAILED;
1279
1280 memcpy(dst, src, srcLen);
1281 }
1282
1283 return inLen;
1284 }
1285
1286 size_t
1287 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1288 const wchar_t *src, size_t srcLen) const
1289 {
1290 if ( srcLen == wxNO_LEN )
1291 srcLen = wxWcslen(src) + 1;
1292
1293 srcLen *= BYTES_PER_CHAR;
1294
1295 if ( dst )
1296 {
1297 if ( dstLen < srcLen )
1298 return wxCONV_FAILED;
1299
1300 memcpy(dst, src, srcLen);
1301 }
1302
1303 return srcLen;
1304 }
1305
1306 // ----------------------------------------------------------------------------
1307 // endian-reversing conversions
1308 // ----------------------------------------------------------------------------
1309
1310 size_t
1311 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1312 const char *src, size_t srcLen) const
1313 {
1314 srcLen = GetLength(src, srcLen);
1315 if ( srcLen == wxNO_LEN )
1316 return wxCONV_FAILED;
1317
1318 srcLen /= BYTES_PER_CHAR;
1319
1320 if ( dst )
1321 {
1322 if ( dstLen < srcLen )
1323 return wxCONV_FAILED;
1324
1325 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1326 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1327 {
1328 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1329 }
1330 }
1331
1332 return srcLen;
1333 }
1334
1335 size_t
1336 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1337 const wchar_t *src, size_t srcLen) const
1338 {
1339 if ( srcLen == wxNO_LEN )
1340 srcLen = wxWcslen(src) + 1;
1341
1342 srcLen *= BYTES_PER_CHAR;
1343
1344 if ( dst )
1345 {
1346 if ( dstLen < srcLen )
1347 return wxCONV_FAILED;
1348
1349 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1350 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1351 {
1352 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1353 }
1354 }
1355
1356 return srcLen;
1357 }
1358
1359 #else // !WC_UTF16: wchar_t is UTF-32
1360
1361 // ----------------------------------------------------------------------------
1362 // conversions without endianness change
1363 // ----------------------------------------------------------------------------
1364
1365 size_t
1366 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1367 const char *src, size_t srcLen) const
1368 {
1369 srcLen = GetLength(src, srcLen);
1370 if ( srcLen == wxNO_LEN )
1371 return wxCONV_FAILED;
1372
1373 const size_t inLen = srcLen / BYTES_PER_CHAR;
1374 if ( !dst )
1375 {
1376 // optimization: return maximal space which could be needed for this
1377 // string even if the real size could be smaller if the buffer contains
1378 // any surrogates
1379 return inLen;
1380 }
1381
1382 size_t outLen = 0;
1383 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1384 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1385 {
1386 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1387 if ( !inBuff )
1388 return wxCONV_FAILED;
1389
1390 if ( ++outLen > dstLen )
1391 return wxCONV_FAILED;
1392
1393 *dst++ = ch;
1394 }
1395
1396
1397 return outLen;
1398 }
1399
1400 size_t
1401 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1402 const wchar_t *src, size_t srcLen) const
1403 {
1404 if ( srcLen == wxNO_LEN )
1405 srcLen = wxWcslen(src) + 1;
1406
1407 size_t outLen = 0;
1408 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1409 for ( size_t n = 0; n < srcLen; n++ )
1410 {
1411 wxUint16 cc[2];
1412 const size_t numChars = encode_utf16(*src++, cc);
1413 if ( numChars == wxCONV_FAILED )
1414 return wxCONV_FAILED;
1415
1416 outLen += numChars * BYTES_PER_CHAR;
1417 if ( outBuff )
1418 {
1419 if ( outLen > dstLen )
1420 return wxCONV_FAILED;
1421
1422 *outBuff++ = cc[0];
1423 if ( numChars == 2 )
1424 {
1425 // second character of a surrogate
1426 *outBuff++ = cc[1];
1427 }
1428 }
1429 }
1430
1431 return outLen;
1432 }
1433
1434 // ----------------------------------------------------------------------------
1435 // endian-reversing conversions
1436 // ----------------------------------------------------------------------------
1437
1438 size_t
1439 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1440 const char *src, size_t srcLen) const
1441 {
1442 srcLen = GetLength(src, srcLen);
1443 if ( srcLen == wxNO_LEN )
1444 return wxCONV_FAILED;
1445
1446 const size_t inLen = srcLen / BYTES_PER_CHAR;
1447 if ( !dst )
1448 {
1449 // optimization: return maximal space which could be needed for this
1450 // string even if the real size could be smaller if the buffer contains
1451 // any surrogates
1452 return inLen;
1453 }
1454
1455 size_t outLen = 0;
1456 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1457 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1458 {
1459 wxUint32 ch;
1460 wxUint16 tmp[2];
1461
1462 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1463 inBuff++;
1464 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1465
1466 const size_t numChars = decode_utf16(tmp, ch);
1467 if ( numChars == wxCONV_FAILED )
1468 return wxCONV_FAILED;
1469
1470 if ( numChars == 2 )
1471 inBuff++;
1472
1473 if ( ++outLen > dstLen )
1474 return wxCONV_FAILED;
1475
1476 *dst++ = ch;
1477 }
1478
1479
1480 return outLen;
1481 }
1482
1483 size_t
1484 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1485 const wchar_t *src, size_t srcLen) const
1486 {
1487 if ( srcLen == wxNO_LEN )
1488 srcLen = wxWcslen(src) + 1;
1489
1490 size_t outLen = 0;
1491 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1492 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1493 {
1494 wxUint16 cc[2];
1495 const size_t numChars = encode_utf16(*src, cc);
1496 if ( numChars == wxCONV_FAILED )
1497 return wxCONV_FAILED;
1498
1499 outLen += numChars * BYTES_PER_CHAR;
1500 if ( outBuff )
1501 {
1502 if ( outLen > dstLen )
1503 return wxCONV_FAILED;
1504
1505 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1506 if ( numChars == 2 )
1507 {
1508 // second character of a surrogate
1509 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1510 }
1511 }
1512 }
1513
1514 return outLen;
1515 }
1516
1517 #endif // WC_UTF16/!WC_UTF16
1518
1519
1520 // ============================================================================
1521 // UTF-32
1522 // ============================================================================
1523
1524 #ifdef WORDS_BIGENDIAN
1525 #define wxMBConvUTF32straight wxMBConvUTF32BE
1526 #define wxMBConvUTF32swap wxMBConvUTF32LE
1527 #else
1528 #define wxMBConvUTF32swap wxMBConvUTF32BE
1529 #define wxMBConvUTF32straight wxMBConvUTF32LE
1530 #endif
1531
1532
1533 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1534 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1535
1536 /* static */
1537 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1538 {
1539 if ( srcLen == wxNO_LEN )
1540 {
1541 // count the number of bytes in input, including the trailing NULs
1542 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1543 for ( srcLen = 1; *inBuff++; srcLen++ )
1544 ;
1545
1546 srcLen *= BYTES_PER_CHAR;
1547 }
1548 else // we already have the length
1549 {
1550 // we can only convert an entire number of UTF-32 characters
1551 if ( srcLen % BYTES_PER_CHAR )
1552 return wxCONV_FAILED;
1553 }
1554
1555 return srcLen;
1556 }
1557
1558 // case when in-memory representation is UTF-16
1559 #ifdef WC_UTF16
1560
1561 // ----------------------------------------------------------------------------
1562 // conversions without endianness change
1563 // ----------------------------------------------------------------------------
1564
1565 size_t
1566 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1567 const char *src, size_t srcLen) const
1568 {
1569 srcLen = GetLength(src, srcLen);
1570 if ( srcLen == wxNO_LEN )
1571 return wxCONV_FAILED;
1572
1573 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1574 const size_t inLen = srcLen / BYTES_PER_CHAR;
1575 size_t outLen = 0;
1576 for ( size_t n = 0; n < inLen; n++ )
1577 {
1578 wxUint16 cc[2];
1579 const size_t numChars = encode_utf16(*inBuff++, cc);
1580 if ( numChars == wxCONV_FAILED )
1581 return wxCONV_FAILED;
1582
1583 outLen += numChars;
1584 if ( dst )
1585 {
1586 if ( outLen > dstLen )
1587 return wxCONV_FAILED;
1588
1589 *dst++ = cc[0];
1590 if ( numChars == 2 )
1591 {
1592 // second character of a surrogate
1593 *dst++ = cc[1];
1594 }
1595 }
1596 }
1597
1598 return outLen;
1599 }
1600
1601 size_t
1602 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1603 const wchar_t *src, size_t srcLen) const
1604 {
1605 if ( srcLen == wxNO_LEN )
1606 srcLen = wxWcslen(src) + 1;
1607
1608 if ( !dst )
1609 {
1610 // optimization: return maximal space which could be needed for this
1611 // string instead of the exact amount which could be less if there are
1612 // any surrogates in the input
1613 //
1614 // we consider that surrogates are rare enough to make it worthwhile to
1615 // avoid running the loop below at the cost of slightly extra memory
1616 // consumption
1617 return srcLen * BYTES_PER_CHAR;
1618 }
1619
1620 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1621 size_t outLen = 0;
1622 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1623 {
1624 const wxUint32 ch = wxDecodeSurrogate(&src);
1625 if ( !src )
1626 return wxCONV_FAILED;
1627
1628 outLen += BYTES_PER_CHAR;
1629
1630 if ( outLen > dstLen )
1631 return wxCONV_FAILED;
1632
1633 *outBuff++ = ch;
1634 }
1635
1636 return outLen;
1637 }
1638
1639 // ----------------------------------------------------------------------------
1640 // endian-reversing conversions
1641 // ----------------------------------------------------------------------------
1642
1643 size_t
1644 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1645 const char *src, size_t srcLen) const
1646 {
1647 srcLen = GetLength(src, srcLen);
1648 if ( srcLen == wxNO_LEN )
1649 return wxCONV_FAILED;
1650
1651 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1652 const size_t inLen = srcLen / BYTES_PER_CHAR;
1653 size_t outLen = 0;
1654 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1655 {
1656 wxUint16 cc[2];
1657 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1658 if ( numChars == wxCONV_FAILED )
1659 return wxCONV_FAILED;
1660
1661 outLen += numChars;
1662 if ( dst )
1663 {
1664 if ( outLen > dstLen )
1665 return wxCONV_FAILED;
1666
1667 *dst++ = cc[0];
1668 if ( numChars == 2 )
1669 {
1670 // second character of a surrogate
1671 *dst++ = cc[1];
1672 }
1673 }
1674 }
1675
1676 return outLen;
1677 }
1678
1679 size_t
1680 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1681 const wchar_t *src, size_t srcLen) const
1682 {
1683 if ( srcLen == wxNO_LEN )
1684 srcLen = wxWcslen(src) + 1;
1685
1686 if ( !dst )
1687 {
1688 // optimization: return maximal space which could be needed for this
1689 // string instead of the exact amount which could be less if there are
1690 // any surrogates in the input
1691 //
1692 // we consider that surrogates are rare enough to make it worthwhile to
1693 // avoid running the loop below at the cost of slightly extra memory
1694 // consumption
1695 return srcLen*BYTES_PER_CHAR;
1696 }
1697
1698 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1699 size_t outLen = 0;
1700 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1701 {
1702 const wxUint32 ch = wxDecodeSurrogate(&src);
1703 if ( !src )
1704 return wxCONV_FAILED;
1705
1706 outLen += BYTES_PER_CHAR;
1707
1708 if ( outLen > dstLen )
1709 return wxCONV_FAILED;
1710
1711 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1712 }
1713
1714 return outLen;
1715 }
1716
1717 #else // !WC_UTF16: wchar_t is UTF-32
1718
1719 // ----------------------------------------------------------------------------
1720 // conversions without endianness change
1721 // ----------------------------------------------------------------------------
1722
1723 size_t
1724 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1725 const char *src, size_t srcLen) const
1726 {
1727 // use memcpy() as it should be much faster than hand-written loop
1728 srcLen = GetLength(src, srcLen);
1729 if ( srcLen == wxNO_LEN )
1730 return wxCONV_FAILED;
1731
1732 const size_t inLen = srcLen/BYTES_PER_CHAR;
1733 if ( dst )
1734 {
1735 if ( dstLen < inLen )
1736 return wxCONV_FAILED;
1737
1738 memcpy(dst, src, srcLen);
1739 }
1740
1741 return inLen;
1742 }
1743
1744 size_t
1745 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1746 const wchar_t *src, size_t srcLen) const
1747 {
1748 if ( srcLen == wxNO_LEN )
1749 srcLen = wxWcslen(src) + 1;
1750
1751 srcLen *= BYTES_PER_CHAR;
1752
1753 if ( dst )
1754 {
1755 if ( dstLen < srcLen )
1756 return wxCONV_FAILED;
1757
1758 memcpy(dst, src, srcLen);
1759 }
1760
1761 return srcLen;
1762 }
1763
1764 // ----------------------------------------------------------------------------
1765 // endian-reversing conversions
1766 // ----------------------------------------------------------------------------
1767
1768 size_t
1769 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1770 const char *src, size_t srcLen) const
1771 {
1772 srcLen = GetLength(src, srcLen);
1773 if ( srcLen == wxNO_LEN )
1774 return wxCONV_FAILED;
1775
1776 srcLen /= BYTES_PER_CHAR;
1777
1778 if ( dst )
1779 {
1780 if ( dstLen < srcLen )
1781 return wxCONV_FAILED;
1782
1783 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1784 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1785 {
1786 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1787 }
1788 }
1789
1790 return srcLen;
1791 }
1792
1793 size_t
1794 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1795 const wchar_t *src, size_t srcLen) const
1796 {
1797 if ( srcLen == wxNO_LEN )
1798 srcLen = wxWcslen(src) + 1;
1799
1800 srcLen *= BYTES_PER_CHAR;
1801
1802 if ( dst )
1803 {
1804 if ( dstLen < srcLen )
1805 return wxCONV_FAILED;
1806
1807 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1808 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1809 {
1810 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1811 }
1812 }
1813
1814 return srcLen;
1815 }
1816
1817 #endif // WC_UTF16/!WC_UTF16
1818
1819
1820 // ============================================================================
1821 // The classes doing conversion using the iconv_xxx() functions
1822 // ============================================================================
1823
1824 #ifdef HAVE_ICONV
1825
1826 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1827 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1828 // (unless there's yet another bug in glibc) the only case when iconv()
1829 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1830 // left in the input buffer -- when _real_ error occurs,
1831 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1832 // iconv() failure.
1833 // [This bug does not appear in glibc 2.2.]
1834 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1835 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1836 (errno != E2BIG || bufLeft != 0))
1837 #else
1838 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1839 #endif
1840
1841 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1842
1843 #define ICONV_T_INVALID ((iconv_t)-1)
1844
1845 #if SIZEOF_WCHAR_T == 4
1846 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1847 #define WC_ENC wxFONTENCODING_UTF32
1848 #elif SIZEOF_WCHAR_T == 2
1849 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1850 #define WC_ENC wxFONTENCODING_UTF16
1851 #else // sizeof(wchar_t) != 2 nor 4
1852 // does this ever happen?
1853 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1854 #endif
1855
1856 // ----------------------------------------------------------------------------
1857 // wxMBConv_iconv: encapsulates an iconv character set
1858 // ----------------------------------------------------------------------------
1859
1860 class wxMBConv_iconv : public wxMBConv
1861 {
1862 public:
1863 wxMBConv_iconv(const char *name);
1864 virtual ~wxMBConv_iconv();
1865
1866 // implement base class virtual methods
1867 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1868 const char *src, size_t srcLen = wxNO_LEN) const;
1869 virtual size_t FromWChar(char *dst, size_t dstLen,
1870 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1871 virtual size_t GetMBNulLen() const;
1872
1873 #if wxUSE_UNICODE_UTF8
1874 virtual bool IsUTF8() const;
1875 #endif
1876
1877 virtual wxMBConv *Clone() const
1878 {
1879 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1880 p->m_minMBCharWidth = m_minMBCharWidth;
1881 return p;
1882 }
1883
1884 bool IsOk() const
1885 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1886
1887 protected:
1888 // the iconv handlers used to translate from multibyte
1889 // to wide char and in the other direction
1890 iconv_t m2w,
1891 w2m;
1892
1893 #if wxUSE_THREADS
1894 // guards access to m2w and w2m objects
1895 wxMutex m_iconvMutex;
1896 #endif
1897
1898 private:
1899 // the name (for iconv_open()) of a wide char charset -- if none is
1900 // available on this machine, it will remain NULL
1901 static wxString ms_wcCharsetName;
1902
1903 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1904 // different endian-ness than the native one
1905 static bool ms_wcNeedsSwap;
1906
1907
1908 // name of the encoding handled by this conversion
1909 wxString m_name;
1910
1911 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1912 // initially
1913 size_t m_minMBCharWidth;
1914 };
1915
1916 // make the constructor available for unit testing
1917 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1918 {
1919 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1920 if ( !result->IsOk() )
1921 {
1922 delete result;
1923 return 0;
1924 }
1925
1926 return result;
1927 }
1928
1929 wxString wxMBConv_iconv::ms_wcCharsetName;
1930 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1931
1932 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1933 : m_name(name)
1934 {
1935 m_minMBCharWidth = 0;
1936
1937 // check for charset that represents wchar_t:
1938 if ( ms_wcCharsetName.empty() )
1939 {
1940 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1941
1942 #if wxUSE_FONTMAP
1943 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1944 #else // !wxUSE_FONTMAP
1945 static const wxChar *names_static[] =
1946 {
1947 #if SIZEOF_WCHAR_T == 4
1948 _T("UCS-4"),
1949 #elif SIZEOF_WCHAR_T = 2
1950 _T("UCS-2"),
1951 #endif
1952 NULL
1953 };
1954 const wxChar **names = names_static;
1955 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1956
1957 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1958 {
1959 const wxString nameCS(*names);
1960
1961 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1962 wxString nameXE(nameCS);
1963
1964 #ifdef WORDS_BIGENDIAN
1965 nameXE += _T("BE");
1966 #else // little endian
1967 nameXE += _T("LE");
1968 #endif
1969
1970 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1971 nameXE.c_str());
1972
1973 m2w = iconv_open(nameXE.ToAscii(), name);
1974 if ( m2w == ICONV_T_INVALID )
1975 {
1976 // try charset w/o bytesex info (e.g. "UCS4")
1977 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1978 nameCS.c_str());
1979 m2w = iconv_open(nameCS.ToAscii(), name);
1980
1981 // and check for bytesex ourselves:
1982 if ( m2w != ICONV_T_INVALID )
1983 {
1984 char buf[2], *bufPtr;
1985 wchar_t wbuf[2];
1986 size_t insz, outsz;
1987 size_t res;
1988
1989 buf[0] = 'A';
1990 buf[1] = 0;
1991 wbuf[0] = 0;
1992 insz = 2;
1993 outsz = SIZEOF_WCHAR_T * 2;
1994 char* wbufPtr = (char*)wbuf;
1995 bufPtr = buf;
1996
1997 res = iconv(
1998 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1999 &wbufPtr, &outsz);
2000
2001 if (ICONV_FAILED(res, insz))
2002 {
2003 wxLogLastError(wxT("iconv"));
2004 wxLogError(_("Conversion to charset '%s' doesn't work."),
2005 nameCS.c_str());
2006 }
2007 else // ok, can convert to this encoding, remember it
2008 {
2009 ms_wcCharsetName = nameCS;
2010 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2011 }
2012 }
2013 }
2014 else // use charset not requiring byte swapping
2015 {
2016 ms_wcCharsetName = nameXE;
2017 }
2018 }
2019
2020 wxLogTrace(TRACE_STRCONV,
2021 wxT("iconv wchar_t charset is \"%s\"%s"),
2022 ms_wcCharsetName.empty() ? wxString("<none>")
2023 : ms_wcCharsetName,
2024 ms_wcNeedsSwap ? _T(" (needs swap)")
2025 : _T(""));
2026 }
2027 else // we already have ms_wcCharsetName
2028 {
2029 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2030 }
2031
2032 if ( ms_wcCharsetName.empty() )
2033 {
2034 w2m = ICONV_T_INVALID;
2035 }
2036 else
2037 {
2038 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2039 if ( w2m == ICONV_T_INVALID )
2040 {
2041 wxLogTrace(TRACE_STRCONV,
2042 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2043 ms_wcCharsetName.c_str(), name);
2044 }
2045 }
2046 }
2047
2048 wxMBConv_iconv::~wxMBConv_iconv()
2049 {
2050 if ( m2w != ICONV_T_INVALID )
2051 iconv_close(m2w);
2052 if ( w2m != ICONV_T_INVALID )
2053 iconv_close(w2m);
2054 }
2055
2056 size_t
2057 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2058 const char *src, size_t srcLen) const
2059 {
2060 if ( srcLen == wxNO_LEN )
2061 {
2062 // find the string length: notice that must be done differently for
2063 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2064 // consecutive NULs
2065 const size_t nulLen = GetMBNulLen();
2066 switch ( nulLen )
2067 {
2068 default:
2069 return wxCONV_FAILED;
2070
2071 case 1:
2072 srcLen = strlen(src); // arguably more optimized than our version
2073 break;
2074
2075 case 2:
2076 case 4:
2077 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2078 // but they also have to start at character boundary and not
2079 // span two adjacent characters
2080 const char *p;
2081 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2082 ;
2083 srcLen = p - src;
2084 break;
2085 }
2086 }
2087
2088 // we express length in the number of (wide) characters but iconv always
2089 // counts buffer sizes it in bytes
2090 dstLen *= SIZEOF_WCHAR_T;
2091
2092 #if wxUSE_THREADS
2093 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2094 // Unfortunately there are a couple of global wxCSConv objects such as
2095 // wxConvLocal that are used all over wx code, so we have to make sure
2096 // the handle is used by at most one thread at the time. Otherwise
2097 // only a few wx classes would be safe to use from non-main threads
2098 // as MB<->WC conversion would fail "randomly".
2099 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2100 #endif // wxUSE_THREADS
2101
2102 size_t res, cres;
2103 const char *pszPtr = src;
2104
2105 if ( dst )
2106 {
2107 char* bufPtr = (char*)dst;
2108
2109 // have destination buffer, convert there
2110 cres = iconv(m2w,
2111 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2112 &bufPtr, &dstLen);
2113 res = dstLen - (dstLen / SIZEOF_WCHAR_T);
2114
2115 if (ms_wcNeedsSwap)
2116 {
2117 // convert to native endianness
2118 for ( unsigned i = 0; i < res; i++ )
2119 dst[dstLen] = WC_BSWAP(dst[i]);
2120 }
2121
2122 // NUL-terminate the string if there is any space left
2123 if (res < dstLen)
2124 dst[res] = 0;
2125 }
2126 else // no destination buffer
2127 {
2128 // convert using temp buffer to calculate the size of the buffer needed
2129 wchar_t tbuf[8];
2130 res = 0;
2131
2132 do
2133 {
2134 char* bufPtr = (char*)tbuf;
2135 dstLen = 8 * SIZEOF_WCHAR_T;
2136
2137 cres = iconv(m2w,
2138 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2139 &bufPtr, &dstLen );
2140
2141 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2142 }
2143 while ((cres == (size_t)-1) && (errno == E2BIG));
2144 }
2145
2146 if (ICONV_FAILED(cres, srcLen))
2147 {
2148 //VS: it is ok if iconv fails, hence trace only
2149 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2150 return wxCONV_FAILED;
2151 }
2152
2153 return res;
2154 }
2155
2156 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2157 const wchar_t *src, size_t srcLen) const
2158 {
2159 #if wxUSE_THREADS
2160 // NB: explained in MB2WC
2161 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2162 #endif
2163
2164 if ( srcLen == wxNO_LEN )
2165 srcLen = wxWcslen(src);
2166
2167 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2168 size_t outbuflen = dstLen;
2169 size_t res, cres;
2170
2171 wchar_t *tmpbuf = 0;
2172
2173 if (ms_wcNeedsSwap)
2174 {
2175 // need to copy to temp buffer to switch endianness
2176 // (doing WC_BSWAP twice on the original buffer won't help, as it
2177 // could be in read-only memory, or be accessed in some other thread)
2178 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2179 for ( size_t i = 0; i < srcLen; i++ )
2180 tmpbuf[i] = WC_BSWAP(src[i]);
2181
2182 tmpbuf[srcLen] = L'\0';
2183 src = tmpbuf;
2184 }
2185
2186 char* inbuf = (char*)src;
2187 if ( dst )
2188 {
2189 // have destination buffer, convert there
2190 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2191
2192 res = dstLen - outbuflen;
2193
2194 // NB: iconv was given only wcslen(src) characters on input, and so
2195 // it couldn't convert the trailing zero. Let's do it ourselves
2196 // if there's some room left for it in the output buffer.
2197 if (res < dstLen)
2198 dst[0] = 0;
2199 }
2200 else // no destination buffer
2201 {
2202 // convert using temp buffer to calculate the size of the buffer needed
2203 char tbuf[16];
2204 res = 0;
2205 do
2206 {
2207 dst = tbuf;
2208 outbuflen = 16;
2209
2210 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2211
2212 res += 16 - outbuflen;
2213 }
2214 while ((cres == (size_t)-1) && (errno == E2BIG));
2215 }
2216
2217 if (ms_wcNeedsSwap)
2218 {
2219 free(tmpbuf);
2220 }
2221
2222 if (ICONV_FAILED(cres, inbuflen))
2223 {
2224 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2225 return wxCONV_FAILED;
2226 }
2227
2228 return res;
2229 }
2230
2231 size_t wxMBConv_iconv::GetMBNulLen() const
2232 {
2233 if ( m_minMBCharWidth == 0 )
2234 {
2235 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2236
2237 #if wxUSE_THREADS
2238 // NB: explained in MB2WC
2239 wxMutexLocker lock(self->m_iconvMutex);
2240 #endif
2241
2242 const wchar_t *wnul = L"";
2243 char buf[8]; // should be enough for NUL in any encoding
2244 size_t inLen = sizeof(wchar_t),
2245 outLen = WXSIZEOF(buf);
2246 char *inBuff = (char *)wnul;
2247 char *outBuff = buf;
2248 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2249 {
2250 self->m_minMBCharWidth = (size_t)-1;
2251 }
2252 else // ok
2253 {
2254 self->m_minMBCharWidth = outBuff - buf;
2255 }
2256 }
2257
2258 return m_minMBCharWidth;
2259 }
2260
2261 #if wxUSE_UNICODE_UTF8
2262 bool wxMBConv_iconv::IsUTF8() const
2263 {
2264 return wxStricmp(m_name, "UTF-8") == 0 ||
2265 wxStricmp(m_name, "UTF8") == 0;
2266 }
2267 #endif
2268
2269 #endif // HAVE_ICONV
2270
2271
2272 // ============================================================================
2273 // Win32 conversion classes
2274 // ============================================================================
2275
2276 #ifdef wxHAVE_WIN32_MB2WC
2277
2278 // from utils.cpp
2279 #if wxUSE_FONTMAP
2280 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2281 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2282 #endif
2283
2284 class wxMBConv_win32 : public wxMBConv
2285 {
2286 public:
2287 wxMBConv_win32()
2288 {
2289 m_CodePage = CP_ACP;
2290 m_minMBCharWidth = 0;
2291 }
2292
2293 wxMBConv_win32(const wxMBConv_win32& conv)
2294 : wxMBConv()
2295 {
2296 m_CodePage = conv.m_CodePage;
2297 m_minMBCharWidth = conv.m_minMBCharWidth;
2298 }
2299
2300 #if wxUSE_FONTMAP
2301 wxMBConv_win32(const char* name)
2302 {
2303 m_CodePage = wxCharsetToCodepage(name);
2304 m_minMBCharWidth = 0;
2305 }
2306
2307 wxMBConv_win32(wxFontEncoding encoding)
2308 {
2309 m_CodePage = wxEncodingToCodepage(encoding);
2310 m_minMBCharWidth = 0;
2311 }
2312 #endif // wxUSE_FONTMAP
2313
2314 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2315 {
2316 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2317 // the behaviour is not compatible with the Unix version (using iconv)
2318 // and break the library itself, e.g. wxTextInputStream::NextChar()
2319 // wouldn't work if reading an incomplete MB char didn't result in an
2320 // error
2321 //
2322 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2323 // Win XP or newer and it is not supported for UTF-[78] so we always
2324 // use our own conversions in this case. See
2325 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2326 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2327 if ( m_CodePage == CP_UTF8 )
2328 {
2329 return wxMBConvUTF8().MB2WC(buf, psz, n);
2330 }
2331
2332 if ( m_CodePage == CP_UTF7 )
2333 {
2334 return wxMBConvUTF7().MB2WC(buf, psz, n);
2335 }
2336
2337 int flags = 0;
2338 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2339 IsAtLeastWin2kSP4() )
2340 {
2341 flags = MB_ERR_INVALID_CHARS;
2342 }
2343
2344 const size_t len = ::MultiByteToWideChar
2345 (
2346 m_CodePage, // code page
2347 flags, // flags: fall on error
2348 psz, // input string
2349 -1, // its length (NUL-terminated)
2350 buf, // output string
2351 buf ? n : 0 // size of output buffer
2352 );
2353 if ( !len )
2354 {
2355 // function totally failed
2356 return wxCONV_FAILED;
2357 }
2358
2359 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2360 // check if we succeeded, by doing a double trip:
2361 if ( !flags && buf )
2362 {
2363 const size_t mbLen = strlen(psz);
2364 wxCharBuffer mbBuf(mbLen);
2365 if ( ::WideCharToMultiByte
2366 (
2367 m_CodePage,
2368 0,
2369 buf,
2370 -1,
2371 mbBuf.data(),
2372 mbLen + 1, // size in bytes, not length
2373 NULL,
2374 NULL
2375 ) == 0 ||
2376 strcmp(mbBuf, psz) != 0 )
2377 {
2378 // we didn't obtain the same thing we started from, hence
2379 // the conversion was lossy and we consider that it failed
2380 return wxCONV_FAILED;
2381 }
2382 }
2383
2384 // note that it returns count of written chars for buf != NULL and size
2385 // of the needed buffer for buf == NULL so in either case the length of
2386 // the string (which never includes the terminating NUL) is one less
2387 return len - 1;
2388 }
2389
2390 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2391 {
2392 /*
2393 we have a problem here: by default, WideCharToMultiByte() may
2394 replace characters unrepresentable in the target code page with bad
2395 quality approximations such as turning "1/2" symbol (U+00BD) into
2396 "1" for the code pages which don't have it and we, obviously, want
2397 to avoid this at any price
2398
2399 the trouble is that this function does it _silently_, i.e. it won't
2400 even tell us whether it did or not... Win98/2000 and higher provide
2401 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2402 we have to resort to a round trip, i.e. check that converting back
2403 results in the same string -- this is, of course, expensive but
2404 otherwise we simply can't be sure to not garble the data.
2405 */
2406
2407 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2408 // it doesn't work with CJK encodings (which we test for rather roughly
2409 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2410 // supporting it
2411 BOOL usedDef wxDUMMY_INITIALIZE(false);
2412 BOOL *pUsedDef;
2413 int flags;
2414 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2415 {
2416 // it's our lucky day
2417 flags = WC_NO_BEST_FIT_CHARS;
2418 pUsedDef = &usedDef;
2419 }
2420 else // old system or unsupported encoding
2421 {
2422 flags = 0;
2423 pUsedDef = NULL;
2424 }
2425
2426 const size_t len = ::WideCharToMultiByte
2427 (
2428 m_CodePage, // code page
2429 flags, // either none or no best fit
2430 pwz, // input string
2431 -1, // it is (wide) NUL-terminated
2432 buf, // output buffer
2433 buf ? n : 0, // and its size
2434 NULL, // default "replacement" char
2435 pUsedDef // [out] was it used?
2436 );
2437
2438 if ( !len )
2439 {
2440 // function totally failed
2441 return wxCONV_FAILED;
2442 }
2443
2444 // we did something, check if we really succeeded
2445 if ( flags )
2446 {
2447 // check if the conversion failed, i.e. if any replacements
2448 // were done
2449 if ( usedDef )
2450 return wxCONV_FAILED;
2451 }
2452 else // we must resort to double tripping...
2453 {
2454 // first we need to ensure that we really have the MB data: this is
2455 // not the case if we're called with NULL buffer, in which case we
2456 // need to do the conversion yet again
2457 wxCharBuffer bufDef;
2458 if ( !buf )
2459 {
2460 bufDef = wxCharBuffer(len);
2461 buf = bufDef.data();
2462 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2463 buf, len, NULL, NULL) )
2464 return wxCONV_FAILED;
2465 }
2466
2467 if ( !n )
2468 n = wcslen(pwz);
2469 wxWCharBuffer wcBuf(n);
2470 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2471 wcscmp(wcBuf, pwz) != 0 )
2472 {
2473 // we didn't obtain the same thing we started from, hence
2474 // the conversion was lossy and we consider that it failed
2475 return wxCONV_FAILED;
2476 }
2477 }
2478
2479 // see the comment above for the reason of "len - 1"
2480 return len - 1;
2481 }
2482
2483 virtual size_t GetMBNulLen() const
2484 {
2485 if ( m_minMBCharWidth == 0 )
2486 {
2487 int len = ::WideCharToMultiByte
2488 (
2489 m_CodePage, // code page
2490 0, // no flags
2491 L"", // input string
2492 1, // translate just the NUL
2493 NULL, // output buffer
2494 0, // and its size
2495 NULL, // no replacement char
2496 NULL // [out] don't care if it was used
2497 );
2498
2499 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2500 switch ( len )
2501 {
2502 default:
2503 wxLogDebug(_T("Unexpected NUL length %d"), len);
2504 self->m_minMBCharWidth = (size_t)-1;
2505 break;
2506
2507 case 0:
2508 self->m_minMBCharWidth = (size_t)-1;
2509 break;
2510
2511 case 1:
2512 case 2:
2513 case 4:
2514 self->m_minMBCharWidth = len;
2515 break;
2516 }
2517 }
2518
2519 return m_minMBCharWidth;
2520 }
2521
2522 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2523
2524 bool IsOk() const { return m_CodePage != -1; }
2525
2526 private:
2527 static bool CanUseNoBestFit()
2528 {
2529 static int s_isWin98Or2k = -1;
2530
2531 if ( s_isWin98Or2k == -1 )
2532 {
2533 int verMaj, verMin;
2534 switch ( wxGetOsVersion(&verMaj, &verMin) )
2535 {
2536 case wxOS_WINDOWS_9X:
2537 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2538 break;
2539
2540 case wxOS_WINDOWS_NT:
2541 s_isWin98Or2k = verMaj >= 5;
2542 break;
2543
2544 default:
2545 // unknown: be conservative by default
2546 s_isWin98Or2k = 0;
2547 break;
2548 }
2549
2550 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2551 }
2552
2553 return s_isWin98Or2k == 1;
2554 }
2555
2556 static bool IsAtLeastWin2kSP4()
2557 {
2558 #ifdef __WXWINCE__
2559 return false;
2560 #else
2561 static int s_isAtLeastWin2kSP4 = -1;
2562
2563 if ( s_isAtLeastWin2kSP4 == -1 )
2564 {
2565 OSVERSIONINFOEX ver;
2566
2567 memset(&ver, 0, sizeof(ver));
2568 ver.dwOSVersionInfoSize = sizeof(ver);
2569 GetVersionEx((OSVERSIONINFO*)&ver);
2570
2571 s_isAtLeastWin2kSP4 =
2572 ((ver.dwMajorVersion > 5) || // Vista+
2573 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2574 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2575 ver.wServicePackMajor >= 4)) // 2000 SP4+
2576 ? 1 : 0;
2577 }
2578
2579 return s_isAtLeastWin2kSP4 == 1;
2580 #endif
2581 }
2582
2583
2584 // the code page we're working with
2585 long m_CodePage;
2586
2587 // cached result of GetMBNulLen(), set to 0 initially meaning
2588 // "unknown"
2589 size_t m_minMBCharWidth;
2590 };
2591
2592 #endif // wxHAVE_WIN32_MB2WC
2593
2594
2595 // ============================================================================
2596 // wxEncodingConverter based conversion classes
2597 // ============================================================================
2598
2599 #if wxUSE_FONTMAP
2600
2601 class wxMBConv_wxwin : public wxMBConv
2602 {
2603 private:
2604 void Init()
2605 {
2606 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2607 // The wxMBConv_cf class does a better job.
2608 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2609 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2610 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2611 }
2612
2613 public:
2614 // temporarily just use wxEncodingConverter stuff,
2615 // so that it works while a better implementation is built
2616 wxMBConv_wxwin(const char* name)
2617 {
2618 if (name)
2619 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2620 else
2621 m_enc = wxFONTENCODING_SYSTEM;
2622
2623 Init();
2624 }
2625
2626 wxMBConv_wxwin(wxFontEncoding enc)
2627 {
2628 m_enc = enc;
2629
2630 Init();
2631 }
2632
2633 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2634 {
2635 size_t inbuf = strlen(psz);
2636 if (buf)
2637 {
2638 if (!m2w.Convert(psz, buf))
2639 return wxCONV_FAILED;
2640 }
2641 return inbuf;
2642 }
2643
2644 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2645 {
2646 const size_t inbuf = wxWcslen(psz);
2647 if (buf)
2648 {
2649 if (!w2m.Convert(psz, buf))
2650 return wxCONV_FAILED;
2651 }
2652
2653 return inbuf;
2654 }
2655
2656 virtual size_t GetMBNulLen() const
2657 {
2658 switch ( m_enc )
2659 {
2660 case wxFONTENCODING_UTF16BE:
2661 case wxFONTENCODING_UTF16LE:
2662 return 2;
2663
2664 case wxFONTENCODING_UTF32BE:
2665 case wxFONTENCODING_UTF32LE:
2666 return 4;
2667
2668 default:
2669 return 1;
2670 }
2671 }
2672
2673 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2674
2675 bool IsOk() const { return m_ok; }
2676
2677 public:
2678 wxFontEncoding m_enc;
2679 wxEncodingConverter m2w, w2m;
2680
2681 private:
2682 // were we initialized successfully?
2683 bool m_ok;
2684
2685 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2686 };
2687
2688 // make the constructors available for unit testing
2689 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2690 {
2691 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2692 if ( !result->IsOk() )
2693 {
2694 delete result;
2695 return 0;
2696 }
2697
2698 return result;
2699 }
2700
2701 #endif // wxUSE_FONTMAP
2702
2703 // ============================================================================
2704 // wxCSConv implementation
2705 // ============================================================================
2706
2707 void wxCSConv::Init()
2708 {
2709 m_name = NULL;
2710 m_convReal = NULL;
2711 m_deferred = true;
2712 }
2713
2714 wxCSConv::wxCSConv(const wxString& charset)
2715 {
2716 Init();
2717
2718 if ( !charset.empty() )
2719 {
2720 SetName(charset.ToAscii());
2721 }
2722
2723 #if wxUSE_FONTMAP
2724 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2725 #else
2726 m_encoding = wxFONTENCODING_SYSTEM;
2727 #endif
2728 }
2729
2730 wxCSConv::wxCSConv(wxFontEncoding encoding)
2731 {
2732 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2733 {
2734 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2735
2736 encoding = wxFONTENCODING_SYSTEM;
2737 }
2738
2739 Init();
2740
2741 m_encoding = encoding;
2742 }
2743
2744 wxCSConv::~wxCSConv()
2745 {
2746 Clear();
2747 }
2748
2749 wxCSConv::wxCSConv(const wxCSConv& conv)
2750 : wxMBConv()
2751 {
2752 Init();
2753
2754 SetName(conv.m_name);
2755 m_encoding = conv.m_encoding;
2756 }
2757
2758 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2759 {
2760 Clear();
2761
2762 SetName(conv.m_name);
2763 m_encoding = conv.m_encoding;
2764
2765 return *this;
2766 }
2767
2768 void wxCSConv::Clear()
2769 {
2770 free(m_name);
2771 delete m_convReal;
2772
2773 m_name = NULL;
2774 m_convReal = NULL;
2775 }
2776
2777 void wxCSConv::SetName(const char *charset)
2778 {
2779 if (charset)
2780 {
2781 m_name = wxStrdup(charset);
2782 m_deferred = true;
2783 }
2784 }
2785
#if wxUSE_FONTMAP

// hash map from font encoding to charset name
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// used by wxCSConv::DoCreate() to remember the name under which iconv
// accepted an encoding or an empty string if it accepted none of its names
static wxEncodingNameCache gs_nameCache;
#endif
2793
// Create the converter really doing the work for m_name or, if it is not
// set, for m_encoding.
//
// Returns a new converter allocated with new or NULL. NULL is returned when
// no conversion object is needed (ISO8859-1 or default encoding) and also
// when no usable converter could be created. An error is logged only if we
// reach the end of the function, the earlier NULL returns are silent.
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
    wxLogTrace(TRACE_STRCONV,
               wxT("creating conversion for %s"),
               (m_name ? m_name
                       : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
#endif // wxUSE_FONTMAP

    // check for the special case of ASCII or ISO8859-1 charset: as we have
    // special knowledge of it anyhow, we don't need to create a special
    // conversion object
    if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
            m_encoding == wxFONTENCODING_DEFAULT )
    {
        // don't convert at all
        return NULL;
    }

    // we trust OS to do conversion better than we can so try external
    // conversion methods first
    //
    // the full order is:
    //      1. OS conversion (iconv() under Unix or Win32 API)
    //      2. hard coded conversions for UTF
    //      3. wxEncodingConverter as fall back

    // step (1)
#ifdef HAVE_ICONV
    // without wxUSE_FONTMAP iconv can only be tried if we have a name
#if !wxUSE_FONTMAP
    if ( m_name )
#endif // !wxUSE_FONTMAP
    {
#if wxUSE_FONTMAP
        wxFontEncoding encoding(m_encoding);
#endif

        // first try the name exactly as it was given to us
        if ( m_name )
        {
            wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
            if ( conv->IsOk() )
                return conv;

            delete conv;

            // iconv doesn't know this name, map it to an encoding to be
            // able to try the other names of this encoding below
#if wxUSE_FONTMAP
            encoding =
                wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
#endif // wxUSE_FONTMAP
        }
#if wxUSE_FONTMAP
        {
            // check if we already know which name works for this encoding
            const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
            if ( it != gs_nameCache.end() )
            {
                // empty name means that none of the names worked before
                //
                // NOTE(review): returning here skips steps (2) and (3)
                // while the first, not yet cached, failure falls through
                // to them -- confirm that this difference is intended
                if ( it->second.empty() )
                    return NULL;

                wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
                if ( conv->IsOk() )
                    return conv;

                delete conv;
            }

            const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
            // CS : in case this does not return valid names (eg for MacRoman)
            //      encoding got a 'failure' entry in the cache all the same,
            //      although it just has to be created using a different method, so
            //      only store failed iconv creation attempts (or perhaps we
            //      shouldn't do this at all ?)
            if ( names[0] != NULL )
            {
                // try all names in turn and remember the first working one
                for ( ; *names; ++names )
                {
                    // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
                    //             will need changes that will obsolete this
                    wxString name(*names);
                    wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
                    if ( conv->IsOk() )
                    {
                        gs_nameCache[encoding] = *names;
                        return conv;
                    }

                    delete conv;
                }

                gs_nameCache[encoding] = _T(""); // cache the failure
            }
        }
#endif // wxUSE_FONTMAP
    }
#endif // HAVE_ICONV

#ifdef wxHAVE_WIN32_MB2WC
    {
        // notice that without wxUSE_FONTMAP no Win32 converter is created
        // at all and NULL is returned immediately
#if wxUSE_FONTMAP
        wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
                                      : new wxMBConv_win32(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
#else
        return NULL;
#endif
    }
#endif // wxHAVE_WIN32_MB2WC

#ifdef __DARWIN__
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
                                       : new wxMBConv_cf(m_encoding);
#else
            wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
#endif

            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif // __DARWIN__

    // step (2)
    wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
    if ( enc == wxFONTENCODING_SYSTEM && m_name )
    {
        // use "false" to suppress interactive dialogs -- we can be called from
        // anywhere and popping up a dialog from here is the last thing we want to
        // do
        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
    }
#endif // wxUSE_FONTMAP

    // the UTF encodings have their own converter classes
    switch ( enc )
    {
        case wxFONTENCODING_UTF7:
             return new wxMBConvUTF7;

        case wxFONTENCODING_UTF8:
             return new wxMBConvUTF8;

        case wxFONTENCODING_UTF16BE:
             return new wxMBConvUTF16BE;

        case wxFONTENCODING_UTF16LE:
             return new wxMBConvUTF16LE;

        case wxFONTENCODING_UTF32BE:
             return new wxMBConvUTF32BE;

        case wxFONTENCODING_UTF32LE:
             return new wxMBConvUTF32LE;

        default:
             // nothing to do but put here to suppress gcc warnings
             break;
    }

    // step (3)
#if wxUSE_FONTMAP
    {
        wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
                                      : new wxMBConv_wxwin(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
    }
#endif // wxUSE_FONTMAP

    // NB: This is a hack to prevent deadlock. What could otherwise happen
    //     in Unicode build: wxConvLocal creation ends up being here
    //     because of some failure and logs the error. But wxLog will try to
    //     attach a timestamp, for which it will need wxConvLocal (to convert
    //     time to char* and then wchar_t*), but that fails, tries to log the
    //     error, but wxLog has an (already locked) critical section that
    //     guards the static buffer.
    static bool alreadyLoggingError = false;
    if (!alreadyLoggingError)
    {
        alreadyLoggingError = true;
        wxLogError(_("Cannot convert from the charset '%s'!"),
                   m_name ? m_name
                      :
#if wxUSE_FONTMAP
                         (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
#else // !wxUSE_FONTMAP
                         (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
              );

        alreadyLoggingError = false;
    }

    return NULL;
}
3000
3001 void wxCSConv::CreateConvIfNeeded() const
3002 {
3003 if ( m_deferred )
3004 {
3005 wxCSConv *self = (wxCSConv *)this; // const_cast
3006
3007 // if we don't have neither the name nor the encoding, use the default
3008 // encoding for this system
3009 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3010 {
3011 #if wxUSE_INTL
3012 self->m_encoding = wxLocale::GetSystemEncoding();
3013 #else
3014 // fallback to some reasonable default:
3015 self->m_encoding = wxFONTENCODING_ISO8859_1;
3016 #endif // wxUSE_INTL
3017 }
3018
3019 self->m_convReal = DoCreate();
3020 self->m_deferred = false;
3021 }
3022 }
3023
3024 bool wxCSConv::IsOk() const
3025 {
3026 CreateConvIfNeeded();
3027
3028 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3029 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3030 return true; // always ok as we do it ourselves
3031
3032 // m_convReal->IsOk() is called at its own creation, so we know it must
3033 // be ok if m_convReal is non-NULL
3034 return m_convReal != NULL;
3035 }
3036
3037 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3038 const char *src, size_t srcLen) const
3039 {
3040 CreateConvIfNeeded();
3041
3042 if (m_convReal)
3043 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3044
3045 // latin-1 (direct)
3046 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3047 }
3048
3049 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3050 const wchar_t *src, size_t srcLen) const
3051 {
3052 CreateConvIfNeeded();
3053
3054 if (m_convReal)
3055 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3056
3057 // latin-1 (direct)
3058 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3059 }
3060
3061 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3062 {
3063 CreateConvIfNeeded();
3064
3065 if (m_convReal)
3066 return m_convReal->MB2WC(buf, psz, n);
3067
3068 // latin-1 (direct)
3069 size_t len = strlen(psz);
3070
3071 if (buf)
3072 {
3073 for (size_t c = 0; c <= len; c++)
3074 buf[c] = (unsigned char)(psz[c]);
3075 }
3076
3077 return len;
3078 }
3079
3080 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3081 {
3082 CreateConvIfNeeded();
3083
3084 if (m_convReal)
3085 return m_convReal->WC2MB(buf, psz, n);
3086
3087 // latin-1 (direct)
3088 const size_t len = wxWcslen(psz);
3089 if (buf)
3090 {
3091 for (size_t c = 0; c <= len; c++)
3092 {
3093 if (psz[c] > 0xFF)
3094 return wxCONV_FAILED;
3095
3096 buf[c] = (char)psz[c];
3097 }
3098 }
3099 else
3100 {
3101 for (size_t c = 0; c <= len; c++)
3102 {
3103 if (psz[c] > 0xFF)
3104 return wxCONV_FAILED;
3105 }
3106 }
3107
3108 return len;
3109 }
3110
3111 size_t wxCSConv::GetMBNulLen() const
3112 {
3113 CreateConvIfNeeded();
3114
3115 if ( m_convReal )
3116 {
3117 return m_convReal->GetMBNulLen();
3118 }
3119
3120 // otherwise, we are ISO-8859-1
3121 return 1;
3122 }
3123
3124 #if wxUSE_UNICODE_UTF8
3125 bool wxCSConv::IsUTF8() const
3126 {
3127 CreateConvIfNeeded();
3128
3129 if ( m_convReal )
3130 {
3131 return m_convReal->IsUTF8();
3132 }
3133
3134 // otherwise, we are ISO-8859-1
3135 return false;
3136 }
3137 #endif
3138
3139
3140 #if wxUSE_UNICODE
3141
3142 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3143 {
3144 if ( !s )
3145 return wxWCharBuffer();
3146
3147 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3148 if ( !wbuf )
3149 wbuf = wxMBConvUTF8().cMB2WX(s);
3150 if ( !wbuf )
3151 wbuf = wxConvISO8859_1.cMB2WX(s);
3152
3153 return wbuf;
3154 }
3155
3156 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3157 {
3158 if ( !ws )
3159 return wxCharBuffer();
3160
3161 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3162 if ( !buf )
3163 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3164
3165 return buf;
3166 }
3167
3168 #endif // wxUSE_UNICODE
3169
3170 // ----------------------------------------------------------------------------
3171 // globals
3172 // ----------------------------------------------------------------------------
3173
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3180
3181 #undef wxConvLibc
3182 #undef wxConvUTF8
3183 #undef wxConvUTF7
3184 #undef wxConvLocal
3185 #undef wxConvISO8859_1
3186
// Define everything needed for the global converter called name:
//  - the pointer name##Ptr of type klass*, initially NULL
//  - the function wxGet_##name##Ptr() returning the address of its local
//    static object of type impl_klass constructed with ctor_args
//  - the static variable gs_##name##instance initialized by calling this
//    function
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
    { \
        static impl_klass name##Obj ctor_args; \
        return &name##Obj; \
    } \
    /* this ensures that all global converter objects are created */ \
    /* by the time static initialization is done, i.e. before any */ \
    /* thread is launched: */ \
    static klass* gs_##name##instance = wxGet_##name##Ptr()
3198
// Same as WX_DEFINE_GLOBAL_CONV2() with klass used both as the type of the
// pointer and as the type of the object itself.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3201
// wxConvLibc is a wxMBConv_win32 object under Windows and a wxMBConvLibc
// one elsewhere, in both cases constructed without any arguments
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif
3207
// NB: we can't use wxEMPTY_PARAMETER_VALUE as the final argument here because
//     it's passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so
//     still provokes an error message about "not enough macro parameters";
//     and we can't use "()" here as the name##Obj declaration would be parsed
//     as a function declaration then, so use a semicolon and live with an
//     extra empty statement (and hope that no compiler warns about this)
// the UTF-8 (strict variant) and UTF-7 converters
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

// wxCSConv objects for the system encoding and for ISO8859-1
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// wxConvCurrent initially points to wxConvLibc and wxConvUI to wxConvLocal
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// wxConvFileName points to the object above under Darwin and to wxConvLibc
// on the other platforms
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3235
3236 #else // !wxUSE_WCHAR_T
3237
3238 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3239 // stand-ins in absence of wchar_t
3240 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3241 wxConvISO8859_1,
3242 wxConvLocal,
3243 wxConvUTF8;
3244
3245 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T