always NUL-terminate the buffers returned by cWC2MB/cMB2WC() overloads taking input...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef __SALFORDC__
48 #include <clib.h>
49 #endif
50
51 #ifdef HAVE_ICONV
52 #include <iconv.h>
53 #include "wx/thread.h"
54 #endif
55
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
58
59 #ifdef __DARWIN__
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
62
63
64 #define TRACE_STRCONV _T("strconv")
65
66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
67 // be 4 bytes
68 #if SIZEOF_WCHAR_T == 2
69 #define WC_UTF16
70 #endif
71
72
73 // ============================================================================
74 // implementation
75 // ============================================================================
76
// helper of the conversion functions below: returns true if at least one of
// the n bytes starting at p is not NUL and false if all of them are NUL (in
// particular, false is returned if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
85
86 // ----------------------------------------------------------------------------
87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
88 // ----------------------------------------------------------------------------
89
90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
91 {
92 if (input <= 0xffff)
93 {
94 if (output)
95 *output = (wxUint16) input;
96
97 return 1;
98 }
99 else if (input >= 0x110000)
100 {
101 return wxCONV_FAILED;
102 }
103 else
104 {
105 if (output)
106 {
107 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
108 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
109 }
110
111 return 2;
112 }
113 }
114
115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
116 {
117 if ((*input < 0xd800) || (*input > 0xdfff))
118 {
119 output = *input;
120 return 1;
121 }
122 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
123 {
124 output = *input;
125 return wxCONV_FAILED;
126 }
127 else
128 {
129 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
130 return 2;
131 }
132 }
133
134 #ifdef WC_UTF16
135 typedef wchar_t wxDecodeSurrogate_t;
136 #else // !WC_UTF16
137 typedef wxUint16 wxDecodeSurrogate_t;
138 #endif // WC_UTF16/!WC_UTF16
139
140 // returns the next UTF-32 character from the wchar_t buffer and advances the
141 // pointer to the character after this one
142 //
143 // if an invalid character is found, *pSrc is set to NULL, the caller must
144 // check for this
145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
146 {
147 wxUint32 out;
148 const size_t
149 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
150 if ( n == wxCONV_FAILED )
151 *pSrc = NULL;
152 else
153 *pSrc += n;
154
155 return out;
156 }
157
158 // ----------------------------------------------------------------------------
159 // wxMBConv
160 // ----------------------------------------------------------------------------
161
162 size_t
163 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
164 const char *src, size_t srcLen) const
165 {
166 // although new conversion classes are supposed to implement this function
167 // directly, the existins ones only implement the old MB2WC() and so, to
168 // avoid to have to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
172
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
175
176 // the number of NULs terminating this string
177 size_t nulLen = 0; // not really needed, but just to avoid warnings
178
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
185 if ( srcLen != wxNO_LEN )
186 {
187 // we need to know how to find the end of this string
188 nulLen = GetMBNulLen();
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
191
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
194 {
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
197 char * const p = bufTmp.data();
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
200 *s = '\0';
201
202 src = bufTmp;
203 }
204
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
211
212 for ( ;; )
213 {
214 // try to convert the current chunk
215 size_t lenChunk = MB2WC(NULL, src, 0);
216 if ( lenChunk == wxCONV_FAILED )
217 return wxCONV_FAILED;
218
219 lenChunk++; // for the L'\0' at the end of this chunk
220
221 dstWritten += lenChunk;
222
223 if ( lenChunk == 1 )
224 {
225 // nothing left in the input string, conversion succeeded
226 break;
227 }
228
229 if ( dst )
230 {
231 if ( dstWritten > dstLen )
232 return wxCONV_FAILED;
233
234 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
235 return wxCONV_FAILED;
236
237 dst += lenChunk;
238 }
239
240 if ( !srcEnd )
241 {
242 // we convert just one chunk in this case as this is the entire
243 // string anyhow
244 break;
245 }
246
247 // advance the input pointer past the end of this chunk
248 while ( NotAllNULs(src, nulLen) )
249 {
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
254 src += nulLen;
255 }
256
257 src += nulLen; // skipping over its terminator as well
258
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by inEnd
262 if ( src >= srcEnd )
263 break;
264 }
265
266 return dstWritten;
267 }
268
269 size_t
270 wxMBConv::FromWChar(char *dst, size_t dstLen,
271 const wchar_t *src, size_t srcLen) const
272 {
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten = 0;
275
276 // make a copy of the input string unless it is already properly
277 // NUL-terminated
278 //
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp;
282 if ( srcLen == wxNO_LEN )
283 {
284 srcLen = wxWcslen(src) + 1;
285 }
286 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
287 {
288 // make a copy in order to properly NUL-terminate the string
289 bufTmp = wxWCharBuffer(srcLen);
290 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
291 src = bufTmp;
292 }
293
294 const size_t lenNul = GetMBNulLen();
295 for ( const wchar_t * const srcEnd = src + srcLen;
296 src < srcEnd;
297 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
298 {
299 // try to convert the current chunk
300 size_t lenChunk = WC2MB(NULL, src, 0);
301
302 if ( lenChunk == wxCONV_FAILED )
303 return wxCONV_FAILED;
304
305 lenChunk += lenNul;
306 dstWritten += lenChunk;
307
308 if ( dst )
309 {
310 if ( dstWritten > dstLen )
311 return wxCONV_FAILED;
312
313 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dst += lenChunk;
317 }
318 }
319
320 return dstWritten;
321 }
322
323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
324 {
325 size_t rc = ToWChar(outBuff, outLen, inBuff);
326 if ( rc != wxCONV_FAILED )
327 {
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
330 rc--;
331 }
332
333 return rc;
334 }
335
336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
337 {
338 size_t rc = FromWChar(outBuff, outLen, inBuff);
339 if ( rc != wxCONV_FAILED )
340 {
341 rc -= GetMBNulLen();
342 }
343
344 return rc;
345 }
346
347 wxMBConv::~wxMBConv()
348 {
349 // nothing to do here (necessary for Darwin linking probably)
350 }
351
352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
353 {
354 if ( psz )
355 {
356 // calculate the length of the buffer needed first
357 const size_t nLen = ToWChar(NULL, 0, psz);
358 if ( nLen != wxCONV_FAILED )
359 {
360 // now do the actual conversion
361 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
362
363 // +1 for the trailing NULL
364 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
365 return buf;
366 }
367 }
368
369 return wxWCharBuffer();
370 }
371
372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
373 {
374 if ( pwz )
375 {
376 const size_t nLen = FromWChar(NULL, 0, pwz);
377 if ( nLen != wxCONV_FAILED )
378 {
379 wxCharBuffer buf(nLen - 1);
380 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
381 return buf;
382 }
383 }
384
385 return wxCharBuffer();
386 }
387
388 const wxWCharBuffer
389 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
390 {
391 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
392 if ( dstLen != wxCONV_FAILED )
393 {
394 // notice that we allocate space for dstLen+1 wide characters here
395 // because we want the buffer to always be NUL-terminated, even if the
396 // input isn't (as otherwise the caller has no way to know its length)
397 wxWCharBuffer wbuf(dstLen);
398 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
399 {
400 if ( outLen )
401 {
402 *outLen = dstLen;
403 if ( wbuf[dstLen - 1] == L'\0' )
404 (*outLen)--;
405 }
406
407 return wbuf;
408 }
409 }
410
411 if ( outLen )
412 *outLen = 0;
413
414 return wxWCharBuffer();
415 }
416
417 const wxCharBuffer
418 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
419 {
420 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
421 if ( dstLen != wxCONV_FAILED )
422 {
423 const size_t nulLen = GetMBNulLen();
424
425 // as above, ensure that the buffer is always NUL-terminated, even if
426 // the input is not
427 wxCharBuffer buf(dstLen + nulLen - 1);
428 memset(buf.data() + dstLen, 0, nulLen);
429 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
430 {
431 if ( outLen )
432 {
433 *outLen = dstLen;
434
435 if ( dstLen >= nulLen &&
436 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
437 {
438 // in this case the output is NUL-terminated and we're not
439 // supposed to count NUL
440 *outLen -= nulLen;
441 }
442 }
443
444 return buf;
445 }
446 }
447
448 if ( outLen )
449 *outLen = 0;
450
451 return wxCharBuffer();
452 }
453
454 // ----------------------------------------------------------------------------
455 // wxMBConvLibc
456 // ----------------------------------------------------------------------------
457
458 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
459 {
460 return wxMB2WC(buf, psz, n);
461 }
462
463 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
464 {
465 return wxWC2MB(buf, psz, n);
466 }
467
468 // ----------------------------------------------------------------------------
469 // wxConvBrokenFileNames
470 // ----------------------------------------------------------------------------
471
472 #ifdef __UNIX__
473
474 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
475 {
476 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
477 wxStricmp(charset, _T("UTF8")) == 0 )
478 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
479 else
480 m_conv = new wxCSConv(charset);
481 }
482
483 #endif // __UNIX__
484
485 // ----------------------------------------------------------------------------
486 // UTF-7
487 // ----------------------------------------------------------------------------
488
489 // Implementation (C) 2004 Fredrik Roubert
490
//
// BASE64 decoding table: maps a byte to the 6 bit value it encodes or to 0xff
// if the byte is not one of "A-Za-z0-9+/"
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f ('+' and '/')
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f (digits)
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f ('A'..'O')
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f ('P'..'Z')
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f ('a'..'o')
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f ('p'..'z')
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
529
530 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
531 {
532 size_t len = 0;
533
534 while ( *psz && (!buf || (len < n)) )
535 {
536 unsigned char cc = *psz++;
537 if (cc != '+')
538 {
539 // plain ASCII char
540 if (buf)
541 *buf++ = cc;
542 len++;
543 }
544 else if (*psz == '-')
545 {
546 // encoded plus sign
547 if (buf)
548 *buf++ = cc;
549 len++;
550 psz++;
551 }
552 else // start of BASE64 encoded string
553 {
554 bool lsb, ok;
555 unsigned int d, l;
556 for ( ok = lsb = false, d = 0, l = 0;
557 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
558 psz++ )
559 {
560 d <<= 6;
561 d += cc;
562 for (l += 6; l >= 8; lsb = !lsb)
563 {
564 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
565 if (lsb)
566 {
567 if (buf)
568 *buf++ |= c;
569 len ++;
570 }
571 else
572 {
573 if (buf)
574 *buf = (wchar_t)(c << 8);
575 }
576
577 ok = true;
578 }
579 }
580
581 if ( !ok )
582 {
583 // in valid UTF7 we should have valid characters after '+'
584 return wxCONV_FAILED;
585 }
586
587 if (*psz == '-')
588 psz++;
589 }
590 }
591
592 if ( buf && (len < n) )
593 *buf = '\0';
594
595 return len;
596 }
597
//
// BASE64 encoding table: maps a 6 bit value to the character encoding it
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
612
//
// UTF-7 encoding table giving the class of each ASCII character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 00..0F
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 10..1F
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // 20..2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // 30..3F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 40..4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 50..5F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 60..6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 70..7F
};
632
633 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
634 {
635 size_t len = 0;
636
637 while (*psz && ((!buf) || (len < n)))
638 {
639 wchar_t cc = *psz++;
640 if (cc < 0x80 && utf7encode[cc] < 1)
641 {
642 // plain ASCII char
643 if (buf)
644 *buf++ = (char)cc;
645
646 len++;
647 }
648 #ifndef WC_UTF16
649 else if (((wxUint32)cc) > 0xffff)
650 {
651 // no surrogate pair generation (yet?)
652 return wxCONV_FAILED;
653 }
654 #endif
655 else
656 {
657 if (buf)
658 *buf++ = '+';
659
660 len++;
661 if (cc != '+')
662 {
663 // BASE64 encode string
664 unsigned int lsb, d, l;
665 for (d = 0, l = 0; /*nothing*/; psz++)
666 {
667 for (lsb = 0; lsb < 2; lsb ++)
668 {
669 d <<= 8;
670 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
671
672 for (l += 8; l >= 6; )
673 {
674 l -= 6;
675 if (buf)
676 *buf++ = utf7enb64[(d >> l) % 64];
677 len++;
678 }
679 }
680
681 cc = *psz;
682 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
683 break;
684 }
685
686 if (l != 0)
687 {
688 if (buf)
689 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
690
691 len++;
692 }
693 }
694
695 if (buf)
696 *buf++ = '-';
697 len++;
698 }
699 }
700
701 if (buf && (len < n))
702 *buf = 0;
703
704 return len;
705 }
706
707 // ----------------------------------------------------------------------------
708 // UTF-8
709 // ----------------------------------------------------------------------------
710
711 static const wxUint32 utf8_max[]=
712 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
713
714 // boundaries of the private use area we use to (temporarily) remap invalid
715 // characters invalid in a UTF-8 encoded string
716 const wxUint32 wxUnicodePUA = 0x100000;
717 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
718
// length of the UTF-8 sequence starting with the given byte or 0 if the byte
// can't start a sequence: this is the case for the continuation bytes
// (80..BF), for C0 and C1 and for F5..FF, i.e. the lead bytes of 5 or 6 byte
// sequences and of the sequences for the code points above U+10FFFF, which
// are not allowed by RFC 3629
const unsigned char tableUtf8Lengths[256] = {
    // 00..7F: single byte sequences (ASCII)
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 80..BF: invalid as the first byte
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // C0..CF: C0 and C1 are invalid, the rest starts two byte sequences
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // D0..DF: two byte sequences
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    // E0..EF: three byte sequences
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    // F0..FF: four byte sequences for F0..F4, the rest is invalid
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
753
754 size_t
755 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
756 const char *src, size_t srcLen) const
757 {
758 wchar_t *out = dstLen ? dst : NULL;
759 size_t written = 0;
760
761 if ( srcLen == wxNO_LEN )
762 srcLen = strlen(src) + 1;
763
764 for ( const char *p = src; ; p++ )
765 {
766 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
767 {
768 // all done successfully, just add the trailing NULL if we are not
769 // using explicit length
770 if ( srcLen == wxNO_LEN )
771 {
772 if ( out )
773 {
774 if ( !dstLen )
775 break;
776
777 *out = L'\0';
778 }
779
780 written++;
781 }
782
783 return written;
784 }
785
786 if ( out && !dstLen-- )
787 break;
788
789 wxUint32 code;
790 unsigned char c = *p;
791
792 if ( c < 0x80 )
793 {
794 if ( srcLen == 0 ) // the test works for wxNO_LEN too
795 break;
796
797 if ( srcLen != wxNO_LEN )
798 srcLen--;
799
800 code = c;
801 }
802 else
803 {
804 unsigned len = tableUtf8Lengths[c];
805 if ( !len )
806 break;
807
808 if ( srcLen < len ) // the test works for wxNO_LEN too
809 break;
810
811 if ( srcLen != wxNO_LEN )
812 srcLen -= len;
813
814 // Char. number range | UTF-8 octet sequence
815 // (hexadecimal) | (binary)
816 // ----------------------+----------------------------------------
817 // 0000 0000 - 0000 007F | 0xxxxxxx
818 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
819 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
820 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
821 //
822 // Code point value is stored in bits marked with 'x',
823 // lowest-order bit of the value on the right side in the diagram
824 // above. (from RFC 3629)
825
826 // mask to extract lead byte's value ('x' bits above), by sequence
827 // length:
828 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
829
830 // mask and value of lead byte's most significant bits, by length:
831 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
832 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
833
834 len--; // it's more convenient to work with 0-based length here
835
836 // extract the lead byte's value bits:
837 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
838 break;
839
840 code = c & leadValueMask[len];
841
842 // all remaining bytes, if any, are handled in the same way
843 // regardless of sequence's length:
844 for ( ; len; --len )
845 {
846 c = *++p;
847 if ( (c & 0xC0) != 0x80 )
848 return wxCONV_FAILED;
849
850 code <<= 6;
851 code |= c & 0x3F;
852 }
853 }
854
855 #ifdef WC_UTF16
856 // cast is ok because wchar_t == wxUint16 if WC_UTF16
857 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
858 {
859 if ( out )
860 out++;
861 written++;
862 }
863 #else // !WC_UTF16
864 if ( out )
865 *out = code;
866 #endif // WC_UTF16/!WC_UTF16
867
868 if ( out )
869 out++;
870
871 written++;
872 }
873
874 return wxCONV_FAILED;
875 }
876
877 size_t
878 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
879 const wchar_t *src, size_t srcLen) const
880 {
881 char *out = dstLen ? dst : NULL;
882 size_t written = 0;
883
884 for ( const wchar_t *wp = src; ; wp++ )
885 {
886 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
887 {
888 // all done successfully, just add the trailing NULL if we are not
889 // using explicit length
890 if ( srcLen == wxNO_LEN )
891 {
892 if ( out )
893 {
894 if ( !dstLen )
895 break;
896
897 *out = '\0';
898 }
899
900 written++;
901 }
902
903 return written;
904 }
905
906
907 wxUint32 code;
908 #ifdef WC_UTF16
909 // cast is ok for WC_UTF16
910 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
911 {
912 // skip the next char too as we decoded a surrogate
913 wp++;
914 }
915 #else // wchar_t is UTF-32
916 code = *wp & 0x7fffffff;
917 #endif
918
919 unsigned len;
920 if ( code <= 0x7F )
921 {
922 len = 1;
923 if ( out )
924 {
925 if ( dstLen < len )
926 break;
927
928 out[0] = (char)code;
929 }
930 }
931 else if ( code <= 0x07FF )
932 {
933 len = 2;
934 if ( out )
935 {
936 if ( dstLen < len )
937 break;
938
939 // NB: this line takes 6 least significant bits, encodes them as
940 // 10xxxxxx and discards them so that the next byte can be encoded:
941 out[1] = 0x80 | (code & 0x3F); code >>= 6;
942 out[0] = 0xC0 | code;
943 }
944 }
945 else if ( code < 0xFFFF )
946 {
947 len = 3;
948 if ( out )
949 {
950 if ( dstLen < len )
951 break;
952
953 out[2] = 0x80 | (code & 0x3F); code >>= 6;
954 out[1] = 0x80 | (code & 0x3F); code >>= 6;
955 out[0] = 0xE0 | code;
956 }
957 }
958 else if ( code <= 0x10FFFF )
959 {
960 len = 4;
961 if ( out )
962 {
963 if ( dstLen < len )
964 break;
965
966 out[3] = 0x80 | (code & 0x3F); code >>= 6;
967 out[2] = 0x80 | (code & 0x3F); code >>= 6;
968 out[1] = 0x80 | (code & 0x3F); code >>= 6;
969 out[0] = 0xF0 | code;
970 }
971 }
972 else
973 {
974 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
975 break;
976 }
977
978 if ( out )
979 {
980 out += len;
981 dstLen -= len;
982 }
983
984 written += len;
985 }
986
987 // we only get here if an error occurs during decoding
988 return wxCONV_FAILED;
989 }
990
991 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
992 const char *psz, size_t srcLen) const
993 {
994 if ( m_options == MAP_INVALID_UTF8_NOT )
995 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
996
997 size_t len = 0;
998
999 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1000 {
1001 const char *opsz = psz;
1002 bool invalid = false;
1003 unsigned char cc = *psz++, fc = cc;
1004 unsigned cnt;
1005 for (cnt = 0; fc & 0x80; cnt++)
1006 fc <<= 1;
1007
1008 if (!cnt)
1009 {
1010 // plain ASCII char
1011 if (buf)
1012 *buf++ = cc;
1013 len++;
1014
1015 // escape the escape character for octal escapes
1016 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1017 && cc == '\\' && (!buf || len < n))
1018 {
1019 if (buf)
1020 *buf++ = cc;
1021 len++;
1022 }
1023 }
1024 else
1025 {
1026 cnt--;
1027 if (!cnt)
1028 {
1029 // invalid UTF-8 sequence
1030 invalid = true;
1031 }
1032 else
1033 {
1034 unsigned ocnt = cnt - 1;
1035 wxUint32 res = cc & (0x3f >> cnt);
1036 while (cnt--)
1037 {
1038 cc = *psz;
1039 if ((cc & 0xC0) != 0x80)
1040 {
1041 // invalid UTF-8 sequence
1042 invalid = true;
1043 break;
1044 }
1045
1046 psz++;
1047 res = (res << 6) | (cc & 0x3f);
1048 }
1049
1050 if (invalid || res <= utf8_max[ocnt])
1051 {
1052 // illegal UTF-8 encoding
1053 invalid = true;
1054 }
1055 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1056 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1057 {
1058 // if one of our PUA characters turns up externally
1059 // it must also be treated as an illegal sequence
1060 // (a bit like you have to escape an escape character)
1061 invalid = true;
1062 }
1063 else
1064 {
1065 #ifdef WC_UTF16
1066 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1067 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1068 if (pa == wxCONV_FAILED)
1069 {
1070 invalid = true;
1071 }
1072 else
1073 {
1074 if (buf)
1075 buf += pa;
1076 len += pa;
1077 }
1078 #else // !WC_UTF16
1079 if (buf)
1080 *buf++ = (wchar_t)res;
1081 len++;
1082 #endif // WC_UTF16/!WC_UTF16
1083 }
1084 }
1085
1086 if (invalid)
1087 {
1088 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1089 {
1090 while (opsz < psz && (!buf || len < n))
1091 {
1092 #ifdef WC_UTF16
1093 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1094 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1095 wxASSERT(pa != wxCONV_FAILED);
1096 if (buf)
1097 buf += pa;
1098 opsz++;
1099 len += pa;
1100 #else
1101 if (buf)
1102 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1103 opsz++;
1104 len++;
1105 #endif
1106 }
1107 }
1108 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1109 {
1110 while (opsz < psz && (!buf || len < n))
1111 {
1112 if ( buf && len + 3 < n )
1113 {
1114 unsigned char on = *opsz;
1115 *buf++ = L'\\';
1116 *buf++ = (wchar_t)( L'0' + on / 0100 );
1117 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1118 *buf++ = (wchar_t)( L'0' + on % 010 );
1119 }
1120
1121 opsz++;
1122 len += 4;
1123 }
1124 }
1125 else // MAP_INVALID_UTF8_NOT
1126 {
1127 return wxCONV_FAILED;
1128 }
1129 }
1130 }
1131 }
1132
1133 if (srcLen == wxNO_LEN && buf && (len < n))
1134 *buf = 0;
1135
1136 return len + 1;
1137 }
1138
// returns true if the given character is an octal digit, i.e. one of 0..7
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
1143
1144 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1145 const wchar_t *psz, size_t srcLen) const
1146 {
1147 if ( m_options == MAP_INVALID_UTF8_NOT )
1148 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1149
1150 size_t len = 0;
1151
1152 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1153 {
1154 wxUint32 cc;
1155
1156 #ifdef WC_UTF16
1157 // cast is ok for WC_UTF16
1158 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1159 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1160 #else
1161 cc = (*psz++) & 0x7fffffff;
1162 #endif
1163
1164 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1165 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1166 {
1167 if (buf)
1168 *buf++ = (char)(cc - wxUnicodePUA);
1169 len++;
1170 }
1171 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1172 && cc == L'\\' && psz[0] == L'\\' )
1173 {
1174 if (buf)
1175 *buf++ = (char)cc;
1176 psz++;
1177 len++;
1178 }
1179 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1180 cc == L'\\' &&
1181 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1182 {
1183 if (buf)
1184 {
1185 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1186 (psz[1] - L'0') * 010 +
1187 (psz[2] - L'0'));
1188 }
1189
1190 psz += 3;
1191 len++;
1192 }
1193 else
1194 {
1195 unsigned cnt;
1196 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1197 {
1198 }
1199
1200 if (!cnt)
1201 {
1202 // plain ASCII char
1203 if (buf)
1204 *buf++ = (char) cc;
1205 len++;
1206 }
1207 else
1208 {
1209 len += cnt + 1;
1210 if (buf)
1211 {
1212 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1213 while (cnt--)
1214 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1215 }
1216 }
1217 }
1218 }
1219
1220 if (srcLen == wxNO_LEN && buf && (len < n))
1221 *buf = 0;
1222
1223 return len + 1;
1224 }
1225
1226 // ============================================================================
1227 // UTF-16
1228 // ============================================================================
1229
// the "straight" class is the one for the UTF-16 variant selected by
// WORDS_BIGENDIAN and the "swap" class is the one for the other variant
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1237
1238 /* static */
1239 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1240 {
1241 if ( srcLen == wxNO_LEN )
1242 {
1243 // count the number of bytes in input, including the trailing NULs
1244 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1245 for ( srcLen = 1; *inBuff++; srcLen++ )
1246 ;
1247
1248 srcLen *= BYTES_PER_CHAR;
1249 }
1250 else // we already have the length
1251 {
1252 // we can only convert an entire number of UTF-16 characters
1253 if ( srcLen % BYTES_PER_CHAR )
1254 return wxCONV_FAILED;
1255 }
1256
1257 return srcLen;
1258 }
1259
1260 // case when in-memory representation is UTF-16 too
1261 #ifdef WC_UTF16
1262
1263 // ----------------------------------------------------------------------------
1264 // conversions without endianness change
1265 // ----------------------------------------------------------------------------
1266
1267 size_t
1268 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1269 const char *src, size_t srcLen) const
1270 {
1271 // set up the scene for using memcpy() (which is presumably more efficient
1272 // than copying the bytes one by one)
1273 srcLen = GetLength(src, srcLen);
1274 if ( srcLen == wxNO_LEN )
1275 return wxCONV_FAILED;
1276
1277 const size_t inLen = srcLen / BYTES_PER_CHAR;
1278 if ( dst )
1279 {
1280 if ( dstLen < inLen )
1281 return wxCONV_FAILED;
1282
1283 memcpy(dst, src, srcLen);
1284 }
1285
1286 return inLen;
1287 }
1288
1289 size_t
1290 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1291 const wchar_t *src, size_t srcLen) const
1292 {
1293 if ( srcLen == wxNO_LEN )
1294 srcLen = wxWcslen(src) + 1;
1295
1296 srcLen *= BYTES_PER_CHAR;
1297
1298 if ( dst )
1299 {
1300 if ( dstLen < srcLen )
1301 return wxCONV_FAILED;
1302
1303 memcpy(dst, src, srcLen);
1304 }
1305
1306 return srcLen;
1307 }
1308
1309 // ----------------------------------------------------------------------------
1310 // endian-reversing conversions
1311 // ----------------------------------------------------------------------------
1312
1313 size_t
1314 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1315 const char *src, size_t srcLen) const
1316 {
1317 srcLen = GetLength(src, srcLen);
1318 if ( srcLen == wxNO_LEN )
1319 return wxCONV_FAILED;
1320
1321 srcLen /= BYTES_PER_CHAR;
1322
1323 if ( dst )
1324 {
1325 if ( dstLen < srcLen )
1326 return wxCONV_FAILED;
1327
1328 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1329 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1330 {
1331 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1332 }
1333 }
1334
1335 return srcLen;
1336 }
1337
1338 size_t
1339 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1340 const wchar_t *src, size_t srcLen) const
1341 {
1342 if ( srcLen == wxNO_LEN )
1343 srcLen = wxWcslen(src) + 1;
1344
1345 srcLen *= BYTES_PER_CHAR;
1346
1347 if ( dst )
1348 {
1349 if ( dstLen < srcLen )
1350 return wxCONV_FAILED;
1351
1352 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1353 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1354 {
1355 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1356 }
1357 }
1358
1359 return srcLen;
1360 }
1361
1362 #else // !WC_UTF16: wchar_t is UTF-32
1363
1364 // ----------------------------------------------------------------------------
1365 // conversions without endianness change
1366 // ----------------------------------------------------------------------------
1367
1368 size_t
1369 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1370 const char *src, size_t srcLen) const
1371 {
1372 srcLen = GetLength(src, srcLen);
1373 if ( srcLen == wxNO_LEN )
1374 return wxCONV_FAILED;
1375
1376 const size_t inLen = srcLen / BYTES_PER_CHAR;
1377 if ( !dst )
1378 {
1379 // optimization: return maximal space which could be needed for this
1380 // string even if the real size could be smaller if the buffer contains
1381 // any surrogates
1382 return inLen;
1383 }
1384
1385 size_t outLen = 0;
1386 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1387 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1388 {
1389 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1390 if ( !inBuff )
1391 return wxCONV_FAILED;
1392
1393 if ( ++outLen > dstLen )
1394 return wxCONV_FAILED;
1395
1396 *dst++ = ch;
1397 }
1398
1399
1400 return outLen;
1401 }
1402
1403 size_t
1404 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1405 const wchar_t *src, size_t srcLen) const
1406 {
1407 if ( srcLen == wxNO_LEN )
1408 srcLen = wxWcslen(src) + 1;
1409
1410 size_t outLen = 0;
1411 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1412 for ( size_t n = 0; n < srcLen; n++ )
1413 {
1414 wxUint16 cc[2];
1415 const size_t numChars = encode_utf16(*src++, cc);
1416 if ( numChars == wxCONV_FAILED )
1417 return wxCONV_FAILED;
1418
1419 outLen += numChars * BYTES_PER_CHAR;
1420 if ( outBuff )
1421 {
1422 if ( outLen > dstLen )
1423 return wxCONV_FAILED;
1424
1425 *outBuff++ = cc[0];
1426 if ( numChars == 2 )
1427 {
1428 // second character of a surrogate
1429 *outBuff++ = cc[1];
1430 }
1431 }
1432 }
1433
1434 return outLen;
1435 }
1436
1437 // ----------------------------------------------------------------------------
1438 // endian-reversing conversions
1439 // ----------------------------------------------------------------------------
1440
1441 size_t
1442 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1443 const char *src, size_t srcLen) const
1444 {
1445 srcLen = GetLength(src, srcLen);
1446 if ( srcLen == wxNO_LEN )
1447 return wxCONV_FAILED;
1448
1449 const size_t inLen = srcLen / BYTES_PER_CHAR;
1450 if ( !dst )
1451 {
1452 // optimization: return maximal space which could be needed for this
1453 // string even if the real size could be smaller if the buffer contains
1454 // any surrogates
1455 return inLen;
1456 }
1457
1458 size_t outLen = 0;
1459 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1460 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1461 {
1462 wxUint32 ch;
1463 wxUint16 tmp[2];
1464
1465 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1466 inBuff++;
1467 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1468
1469 const size_t numChars = decode_utf16(tmp, ch);
1470 if ( numChars == wxCONV_FAILED )
1471 return wxCONV_FAILED;
1472
1473 if ( numChars == 2 )
1474 inBuff++;
1475
1476 if ( ++outLen > dstLen )
1477 return wxCONV_FAILED;
1478
1479 *dst++ = ch;
1480 }
1481
1482
1483 return outLen;
1484 }
1485
1486 size_t
1487 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1488 const wchar_t *src, size_t srcLen) const
1489 {
1490 if ( srcLen == wxNO_LEN )
1491 srcLen = wxWcslen(src) + 1;
1492
1493 size_t outLen = 0;
1494 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1495 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1496 {
1497 wxUint16 cc[2];
1498 const size_t numChars = encode_utf16(*src, cc);
1499 if ( numChars == wxCONV_FAILED )
1500 return wxCONV_FAILED;
1501
1502 outLen += numChars * BYTES_PER_CHAR;
1503 if ( outBuff )
1504 {
1505 if ( outLen > dstLen )
1506 return wxCONV_FAILED;
1507
1508 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1509 if ( numChars == 2 )
1510 {
1511 // second character of a surrogate
1512 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1513 }
1514 }
1515 }
1516
1517 return outLen;
1518 }
1519
1520 #endif // WC_UTF16/!WC_UTF16
1521
1522
1523 // ============================================================================
1524 // UTF-32
1525 // ============================================================================
1526
// "straight" is the UTF-32 converter whose byte order needs no change on this
// platform and "swap" the one which requires reversing the bytes of each unit
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32LE
    #define wxMBConvUTF32swap     wxMBConvUTF32BE
#else // big endian
    #define wxMBConvUTF32straight wxMBConvUTF32BE
    #define wxMBConvUTF32swap     wxMBConvUTF32LE
#endif
1534
1535
1536 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1537 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1538
1539 /* static */
1540 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1541 {
1542 if ( srcLen == wxNO_LEN )
1543 {
1544 // count the number of bytes in input, including the trailing NULs
1545 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1546 for ( srcLen = 1; *inBuff++; srcLen++ )
1547 ;
1548
1549 srcLen *= BYTES_PER_CHAR;
1550 }
1551 else // we already have the length
1552 {
1553 // we can only convert an entire number of UTF-32 characters
1554 if ( srcLen % BYTES_PER_CHAR )
1555 return wxCONV_FAILED;
1556 }
1557
1558 return srcLen;
1559 }
1560
1561 // case when in-memory representation is UTF-16
1562 #ifdef WC_UTF16
1563
1564 // ----------------------------------------------------------------------------
1565 // conversions without endianness change
1566 // ----------------------------------------------------------------------------
1567
1568 size_t
1569 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1570 const char *src, size_t srcLen) const
1571 {
1572 srcLen = GetLength(src, srcLen);
1573 if ( srcLen == wxNO_LEN )
1574 return wxCONV_FAILED;
1575
1576 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1577 const size_t inLen = srcLen / BYTES_PER_CHAR;
1578 size_t outLen = 0;
1579 for ( size_t n = 0; n < inLen; n++ )
1580 {
1581 wxUint16 cc[2];
1582 const size_t numChars = encode_utf16(*inBuff++, cc);
1583 if ( numChars == wxCONV_FAILED )
1584 return wxCONV_FAILED;
1585
1586 outLen += numChars;
1587 if ( dst )
1588 {
1589 if ( outLen > dstLen )
1590 return wxCONV_FAILED;
1591
1592 *dst++ = cc[0];
1593 if ( numChars == 2 )
1594 {
1595 // second character of a surrogate
1596 *dst++ = cc[1];
1597 }
1598 }
1599 }
1600
1601 return outLen;
1602 }
1603
1604 size_t
1605 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1606 const wchar_t *src, size_t srcLen) const
1607 {
1608 if ( srcLen == wxNO_LEN )
1609 srcLen = wxWcslen(src) + 1;
1610
1611 if ( !dst )
1612 {
1613 // optimization: return maximal space which could be needed for this
1614 // string instead of the exact amount which could be less if there are
1615 // any surrogates in the input
1616 //
1617 // we consider that surrogates are rare enough to make it worthwhile to
1618 // avoid running the loop below at the cost of slightly extra memory
1619 // consumption
1620 return srcLen * BYTES_PER_CHAR;
1621 }
1622
1623 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1624 size_t outLen = 0;
1625 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1626 {
1627 const wxUint32 ch = wxDecodeSurrogate(&src);
1628 if ( !src )
1629 return wxCONV_FAILED;
1630
1631 outLen += BYTES_PER_CHAR;
1632
1633 if ( outLen > dstLen )
1634 return wxCONV_FAILED;
1635
1636 *outBuff++ = ch;
1637 }
1638
1639 return outLen;
1640 }
1641
1642 // ----------------------------------------------------------------------------
1643 // endian-reversing conversions
1644 // ----------------------------------------------------------------------------
1645
1646 size_t
1647 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1648 const char *src, size_t srcLen) const
1649 {
1650 srcLen = GetLength(src, srcLen);
1651 if ( srcLen == wxNO_LEN )
1652 return wxCONV_FAILED;
1653
1654 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1655 const size_t inLen = srcLen / BYTES_PER_CHAR;
1656 size_t outLen = 0;
1657 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1658 {
1659 wxUint16 cc[2];
1660 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1661 if ( numChars == wxCONV_FAILED )
1662 return wxCONV_FAILED;
1663
1664 outLen += numChars;
1665 if ( dst )
1666 {
1667 if ( outLen > dstLen )
1668 return wxCONV_FAILED;
1669
1670 *dst++ = cc[0];
1671 if ( numChars == 2 )
1672 {
1673 // second character of a surrogate
1674 *dst++ = cc[1];
1675 }
1676 }
1677 }
1678
1679 return outLen;
1680 }
1681
1682 size_t
1683 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1684 const wchar_t *src, size_t srcLen) const
1685 {
1686 if ( srcLen == wxNO_LEN )
1687 srcLen = wxWcslen(src) + 1;
1688
1689 if ( !dst )
1690 {
1691 // optimization: return maximal space which could be needed for this
1692 // string instead of the exact amount which could be less if there are
1693 // any surrogates in the input
1694 //
1695 // we consider that surrogates are rare enough to make it worthwhile to
1696 // avoid running the loop below at the cost of slightly extra memory
1697 // consumption
1698 return srcLen*BYTES_PER_CHAR;
1699 }
1700
1701 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1702 size_t outLen = 0;
1703 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1704 {
1705 const wxUint32 ch = wxDecodeSurrogate(&src);
1706 if ( !src )
1707 return wxCONV_FAILED;
1708
1709 outLen += BYTES_PER_CHAR;
1710
1711 if ( outLen > dstLen )
1712 return wxCONV_FAILED;
1713
1714 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1715 }
1716
1717 return outLen;
1718 }
1719
1720 #else // !WC_UTF16: wchar_t is UTF-32
1721
1722 // ----------------------------------------------------------------------------
1723 // conversions without endianness change
1724 // ----------------------------------------------------------------------------
1725
1726 size_t
1727 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1728 const char *src, size_t srcLen) const
1729 {
1730 // use memcpy() as it should be much faster than hand-written loop
1731 srcLen = GetLength(src, srcLen);
1732 if ( srcLen == wxNO_LEN )
1733 return wxCONV_FAILED;
1734
1735 const size_t inLen = srcLen/BYTES_PER_CHAR;
1736 if ( dst )
1737 {
1738 if ( dstLen < inLen )
1739 return wxCONV_FAILED;
1740
1741 memcpy(dst, src, srcLen);
1742 }
1743
1744 return inLen;
1745 }
1746
1747 size_t
1748 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1749 const wchar_t *src, size_t srcLen) const
1750 {
1751 if ( srcLen == wxNO_LEN )
1752 srcLen = wxWcslen(src) + 1;
1753
1754 srcLen *= BYTES_PER_CHAR;
1755
1756 if ( dst )
1757 {
1758 if ( dstLen < srcLen )
1759 return wxCONV_FAILED;
1760
1761 memcpy(dst, src, srcLen);
1762 }
1763
1764 return srcLen;
1765 }
1766
1767 // ----------------------------------------------------------------------------
1768 // endian-reversing conversions
1769 // ----------------------------------------------------------------------------
1770
1771 size_t
1772 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1773 const char *src, size_t srcLen) const
1774 {
1775 srcLen = GetLength(src, srcLen);
1776 if ( srcLen == wxNO_LEN )
1777 return wxCONV_FAILED;
1778
1779 srcLen /= BYTES_PER_CHAR;
1780
1781 if ( dst )
1782 {
1783 if ( dstLen < srcLen )
1784 return wxCONV_FAILED;
1785
1786 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1787 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1788 {
1789 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1790 }
1791 }
1792
1793 return srcLen;
1794 }
1795
1796 size_t
1797 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1798 const wchar_t *src, size_t srcLen) const
1799 {
1800 if ( srcLen == wxNO_LEN )
1801 srcLen = wxWcslen(src) + 1;
1802
1803 srcLen *= BYTES_PER_CHAR;
1804
1805 if ( dst )
1806 {
1807 if ( dstLen < srcLen )
1808 return wxCONV_FAILED;
1809
1810 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1811 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1812 {
1813 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1814 }
1815 }
1816
1817 return srcLen;
1818 }
1819
1820 #endif // WC_UTF16/!WC_UTF16
1821
1822
1823 // ============================================================================
1824 // The classes doing conversion using the iconv_xxx() functions
1825 // ============================================================================
1826
1827 #ifdef HAVE_ICONV
1828
1829 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1830 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1831 // (unless there's yet another bug in glibc) the only case when iconv()
1832 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1833 // left in the input buffer -- when _real_ error occurs,
1834 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1835 // iconv() failure.
1836 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) tests whether an iconv() call returning cres
// and leaving bufLeft unconverted input bytes has really failed.
//
// The macro arguments are parenthesized so that arbitrary expressions (e.g.
// using the conditional operator) can be safely passed to it.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type iconv() expects for its
// second argument
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// the value iconv_open() returns on failure
#define ICONV_T_INVALID ((iconv_t)-1)
1847
1848 #if SIZEOF_WCHAR_T == 4
1849 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1850 #define WC_ENC wxFONTENCODING_UTF32
1851 #elif SIZEOF_WCHAR_T == 2
1852 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1853 #define WC_ENC wxFONTENCODING_UTF16
1854 #else // sizeof(wchar_t) != 2 nor 4
1855 // does this ever happen?
1856 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1857 #endif
1858
1859 // ----------------------------------------------------------------------------
1860 // wxMBConv_iconv: encapsulates an iconv character set
1861 // ----------------------------------------------------------------------------
1862
1863 class wxMBConv_iconv : public wxMBConv
1864 {
1865 public:
1866 wxMBConv_iconv(const char *name);
1867 virtual ~wxMBConv_iconv();
1868
1869 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1870 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1871
1872 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1873 virtual size_t GetMBNulLen() const;
1874
1875 #if wxUSE_UNICODE_UTF8
1876 virtual bool IsUTF8() const;
1877 #endif
1878
1879 virtual wxMBConv *Clone() const
1880 {
1881 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1882 p->m_minMBCharWidth = m_minMBCharWidth;
1883 return p;
1884 }
1885
1886 bool IsOk() const
1887 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1888
1889 protected:
1890 // the iconv handlers used to translate from multibyte
1891 // to wide char and in the other direction
1892 iconv_t m2w,
1893 w2m;
1894
1895 #if wxUSE_THREADS
1896 // guards access to m2w and w2m objects
1897 wxMutex m_iconvMutex;
1898 #endif
1899
1900 private:
1901 // the name (for iconv_open()) of a wide char charset -- if none is
1902 // available on this machine, it will remain NULL
1903 static wxString ms_wcCharsetName;
1904
1905 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1906 // different endian-ness than the native one
1907 static bool ms_wcNeedsSwap;
1908
1909
1910 // name of the encoding handled by this conversion
1911 wxString m_name;
1912
1913 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1914 // initially
1915 size_t m_minMBCharWidth;
1916 };
1917
1918 // make the constructor available for unit testing
1919 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1920 {
1921 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1922 if ( !result->IsOk() )
1923 {
1924 delete result;
1925 return 0;
1926 }
1927
1928 return result;
1929 }
1930
1931 wxString wxMBConv_iconv::ms_wcCharsetName;
1932 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1933
1934 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1935 : m_name(name)
1936 {
1937 m_minMBCharWidth = 0;
1938
1939 // check for charset that represents wchar_t:
1940 if ( ms_wcCharsetName.empty() )
1941 {
1942 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1943
1944 #if wxUSE_FONTMAP
1945 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1946 #else // !wxUSE_FONTMAP
1947 static const wxChar *names_static[] =
1948 {
1949 #if SIZEOF_WCHAR_T == 4
1950 _T("UCS-4"),
1951 #elif SIZEOF_WCHAR_T = 2
1952 _T("UCS-2"),
1953 #endif
1954 NULL
1955 };
1956 const wxChar **names = names_static;
1957 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1958
1959 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1960 {
1961 const wxString nameCS(*names);
1962
1963 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1964 wxString nameXE(nameCS);
1965
1966 #ifdef WORDS_BIGENDIAN
1967 nameXE += _T("BE");
1968 #else // little endian
1969 nameXE += _T("LE");
1970 #endif
1971
1972 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1973 nameXE.c_str());
1974
1975 m2w = iconv_open(nameXE.ToAscii(), name);
1976 if ( m2w == ICONV_T_INVALID )
1977 {
1978 // try charset w/o bytesex info (e.g. "UCS4")
1979 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1980 nameCS.c_str());
1981 m2w = iconv_open(nameCS.ToAscii(), name);
1982
1983 // and check for bytesex ourselves:
1984 if ( m2w != ICONV_T_INVALID )
1985 {
1986 char buf[2], *bufPtr;
1987 wchar_t wbuf[2], *wbufPtr;
1988 size_t insz, outsz;
1989 size_t res;
1990
1991 buf[0] = 'A';
1992 buf[1] = 0;
1993 wbuf[0] = 0;
1994 insz = 2;
1995 outsz = SIZEOF_WCHAR_T * 2;
1996 wbufPtr = wbuf;
1997 bufPtr = buf;
1998
1999 res = iconv(
2000 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2001 (char**)&wbufPtr, &outsz);
2002
2003 if (ICONV_FAILED(res, insz))
2004 {
2005 wxLogLastError(wxT("iconv"));
2006 wxLogError(_("Conversion to charset '%s' doesn't work."),
2007 nameCS.c_str());
2008 }
2009 else // ok, can convert to this encoding, remember it
2010 {
2011 ms_wcCharsetName = nameCS;
2012 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2013 }
2014 }
2015 }
2016 else // use charset not requiring byte swapping
2017 {
2018 ms_wcCharsetName = nameXE;
2019 }
2020 }
2021
2022 wxLogTrace(TRACE_STRCONV,
2023 wxT("iconv wchar_t charset is \"%s\"%s"),
2024 ms_wcCharsetName.empty() ? wxString("<none>")
2025 : ms_wcCharsetName,
2026 ms_wcNeedsSwap ? _T(" (needs swap)")
2027 : _T(""));
2028 }
2029 else // we already have ms_wcCharsetName
2030 {
2031 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2032 }
2033
2034 if ( ms_wcCharsetName.empty() )
2035 {
2036 w2m = ICONV_T_INVALID;
2037 }
2038 else
2039 {
2040 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2041 if ( w2m == ICONV_T_INVALID )
2042 {
2043 wxLogTrace(TRACE_STRCONV,
2044 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2045 ms_wcCharsetName.c_str(), name);
2046 }
2047 }
2048 }
2049
2050 wxMBConv_iconv::~wxMBConv_iconv()
2051 {
2052 if ( m2w != ICONV_T_INVALID )
2053 iconv_close(m2w);
2054 if ( w2m != ICONV_T_INVALID )
2055 iconv_close(w2m);
2056 }
2057
2058 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2059 {
2060 // find the string length: notice that must be done differently for
2061 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2062 size_t inbuf;
2063 const size_t nulLen = GetMBNulLen();
2064 switch ( nulLen )
2065 {
2066 default:
2067 return wxCONV_FAILED;
2068
2069 case 1:
2070 inbuf = strlen(psz); // arguably more optimized than our version
2071 break;
2072
2073 case 2:
2074 case 4:
2075 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2076 // they also have to start at character boundary and not span two
2077 // adjacent characters
2078 const char *p;
2079 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2080 ;
2081 inbuf = p - psz;
2082 break;
2083 }
2084
2085 #if wxUSE_THREADS
2086 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2087 // Unfortunately there are a couple of global wxCSConv objects such as
2088 // wxConvLocal that are used all over wx code, so we have to make sure
2089 // the handle is used by at most one thread at the time. Otherwise
2090 // only a few wx classes would be safe to use from non-main threads
2091 // as MB<->WC conversion would fail "randomly".
2092 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2093 #endif // wxUSE_THREADS
2094
2095 size_t outbuf = n * SIZEOF_WCHAR_T;
2096 size_t res, cres;
2097 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2098 wchar_t *bufPtr = buf;
2099 const char *pszPtr = psz;
2100
2101 if (buf)
2102 {
2103 // have destination buffer, convert there
2104 cres = iconv(m2w,
2105 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2106 (char**)&bufPtr, &outbuf);
2107 res = n - (outbuf / SIZEOF_WCHAR_T);
2108
2109 if (ms_wcNeedsSwap)
2110 {
2111 // convert to native endianness
2112 for ( unsigned i = 0; i < res; i++ )
2113 buf[n] = WC_BSWAP(buf[i]);
2114 }
2115
2116 // NUL-terminate the string if there is any space left
2117 if (res < n)
2118 buf[res] = 0;
2119 }
2120 else
2121 {
2122 // no destination buffer... convert using temp buffer
2123 // to calculate destination buffer requirement
2124 wchar_t tbuf[8];
2125 res = 0;
2126
2127 do
2128 {
2129 bufPtr = tbuf;
2130 outbuf = 8 * SIZEOF_WCHAR_T;
2131
2132 cres = iconv(m2w,
2133 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2134 (char**)&bufPtr, &outbuf );
2135
2136 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2137 }
2138 while ((cres == (size_t)-1) && (errno == E2BIG));
2139 }
2140
2141 if (ICONV_FAILED(cres, inbuf))
2142 {
2143 //VS: it is ok if iconv fails, hence trace only
2144 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2145 return wxCONV_FAILED;
2146 }
2147
2148 return res;
2149 }
2150
2151 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2152 {
2153 #if wxUSE_THREADS
2154 // NB: explained in MB2WC
2155 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2156 #endif
2157
2158 size_t inlen = wxWcslen(psz);
2159 size_t inbuf = inlen * SIZEOF_WCHAR_T;
2160 size_t outbuf = n;
2161 size_t res, cres;
2162
2163 wchar_t *tmpbuf = 0;
2164
2165 if (ms_wcNeedsSwap)
2166 {
2167 // need to copy to temp buffer to switch endianness
2168 // (doing WC_BSWAP twice on the original buffer won't help, as it
2169 // could be in read-only memory, or be accessed in some other thread)
2170 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2171 for ( size_t i = 0; i < inlen; i++ )
2172 tmpbuf[n] = WC_BSWAP(psz[i]);
2173
2174 tmpbuf[inlen] = L'\0';
2175 psz = tmpbuf;
2176 }
2177
2178 if (buf)
2179 {
2180 // have destination buffer, convert there
2181 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2182
2183 res = n - outbuf;
2184
2185 // NB: iconv was given only wcslen(psz) characters on input, and so
2186 // it couldn't convert the trailing zero. Let's do it ourselves
2187 // if there's some room left for it in the output buffer.
2188 if (res < n)
2189 buf[0] = 0;
2190 }
2191 else
2192 {
2193 // no destination buffer: convert using temp buffer
2194 // to calculate destination buffer requirement
2195 char tbuf[16];
2196 res = 0;
2197 do
2198 {
2199 buf = tbuf;
2200 outbuf = 16;
2201
2202 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2203
2204 res += 16 - outbuf;
2205 }
2206 while ((cres == (size_t)-1) && (errno == E2BIG));
2207 }
2208
2209 if (ms_wcNeedsSwap)
2210 {
2211 free(tmpbuf);
2212 }
2213
2214 if (ICONV_FAILED(cres, inbuf))
2215 {
2216 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2217 return wxCONV_FAILED;
2218 }
2219
2220 return res;
2221 }
2222
2223 size_t wxMBConv_iconv::GetMBNulLen() const
2224 {
2225 if ( m_minMBCharWidth == 0 )
2226 {
2227 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2228
2229 #if wxUSE_THREADS
2230 // NB: explained in MB2WC
2231 wxMutexLocker lock(self->m_iconvMutex);
2232 #endif
2233
2234 const wchar_t *wnul = L"";
2235 char buf[8]; // should be enough for NUL in any encoding
2236 size_t inLen = sizeof(wchar_t),
2237 outLen = WXSIZEOF(buf);
2238 char *inBuff = (char *)wnul;
2239 char *outBuff = buf;
2240 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2241 {
2242 self->m_minMBCharWidth = (size_t)-1;
2243 }
2244 else // ok
2245 {
2246 self->m_minMBCharWidth = outBuff - buf;
2247 }
2248 }
2249
2250 return m_minMBCharWidth;
2251 }
2252
2253 #if wxUSE_UNICODE_UTF8
2254 bool wxMBConv_iconv::IsUTF8() const
2255 {
2256 return wxStricmp(m_name, "UTF-8") == 0 ||
2257 wxStricmp(m_name, "UTF8") == 0;
2258 }
2259 #endif
2260
2261 #endif // HAVE_ICONV
2262
2263
2264 // ============================================================================
2265 // Win32 conversion classes
2266 // ============================================================================
2267
2268 #ifdef wxHAVE_WIN32_MB2WC
2269
2270 // from utils.cpp
2271 #if wxUSE_FONTMAP
2272 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2273 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2274 #endif
2275
2276 class wxMBConv_win32 : public wxMBConv
2277 {
2278 public:
2279 wxMBConv_win32()
2280 {
2281 m_CodePage = CP_ACP;
2282 m_minMBCharWidth = 0;
2283 }
2284
2285 wxMBConv_win32(const wxMBConv_win32& conv)
2286 : wxMBConv()
2287 {
2288 m_CodePage = conv.m_CodePage;
2289 m_minMBCharWidth = conv.m_minMBCharWidth;
2290 }
2291
2292 #if wxUSE_FONTMAP
2293 wxMBConv_win32(const char* name)
2294 {
2295 m_CodePage = wxCharsetToCodepage(name);
2296 m_minMBCharWidth = 0;
2297 }
2298
2299 wxMBConv_win32(wxFontEncoding encoding)
2300 {
2301 m_CodePage = wxEncodingToCodepage(encoding);
2302 m_minMBCharWidth = 0;
2303 }
2304 #endif // wxUSE_FONTMAP
2305
2306 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2307 {
2308 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2309 // the behaviour is not compatible with the Unix version (using iconv)
2310 // and break the library itself, e.g. wxTextInputStream::NextChar()
2311 // wouldn't work if reading an incomplete MB char didn't result in an
2312 // error
2313 //
2314 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2315 // Win XP or newer and it is not supported for UTF-[78] so we always
2316 // use our own conversions in this case. See
2317 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2318 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2319 if ( m_CodePage == CP_UTF8 )
2320 {
2321 return wxMBConvUTF8().MB2WC(buf, psz, n);
2322 }
2323
2324 if ( m_CodePage == CP_UTF7 )
2325 {
2326 return wxMBConvUTF7().MB2WC(buf, psz, n);
2327 }
2328
2329 int flags = 0;
2330 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2331 IsAtLeastWin2kSP4() )
2332 {
2333 flags = MB_ERR_INVALID_CHARS;
2334 }
2335
2336 const size_t len = ::MultiByteToWideChar
2337 (
2338 m_CodePage, // code page
2339 flags, // flags: fall on error
2340 psz, // input string
2341 -1, // its length (NUL-terminated)
2342 buf, // output string
2343 buf ? n : 0 // size of output buffer
2344 );
2345 if ( !len )
2346 {
2347 // function totally failed
2348 return wxCONV_FAILED;
2349 }
2350
2351 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2352 // check if we succeeded, by doing a double trip:
2353 if ( !flags && buf )
2354 {
2355 const size_t mbLen = strlen(psz);
2356 wxCharBuffer mbBuf(mbLen);
2357 if ( ::WideCharToMultiByte
2358 (
2359 m_CodePage,
2360 0,
2361 buf,
2362 -1,
2363 mbBuf.data(),
2364 mbLen + 1, // size in bytes, not length
2365 NULL,
2366 NULL
2367 ) == 0 ||
2368 strcmp(mbBuf, psz) != 0 )
2369 {
2370 // we didn't obtain the same thing we started from, hence
2371 // the conversion was lossy and we consider that it failed
2372 return wxCONV_FAILED;
2373 }
2374 }
2375
2376 // note that it returns count of written chars for buf != NULL and size
2377 // of the needed buffer for buf == NULL so in either case the length of
2378 // the string (which never includes the terminating NUL) is one less
2379 return len - 1;
2380 }
2381
2382 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2383 {
2384 /*
2385 we have a problem here: by default, WideCharToMultiByte() may
2386 replace characters unrepresentable in the target code page with bad
2387 quality approximations such as turning "1/2" symbol (U+00BD) into
2388 "1" for the code pages which don't have it and we, obviously, want
2389 to avoid this at any price
2390
2391 the trouble is that this function does it _silently_, i.e. it won't
2392 even tell us whether it did or not... Win98/2000 and higher provide
2393 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2394 we have to resort to a round trip, i.e. check that converting back
2395 results in the same string -- this is, of course, expensive but
2396 otherwise we simply can't be sure to not garble the data.
2397 */
2398
2399 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2400 // it doesn't work with CJK encodings (which we test for rather roughly
2401 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2402 // supporting it
2403 BOOL usedDef wxDUMMY_INITIALIZE(false);
2404 BOOL *pUsedDef;
2405 int flags;
2406 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2407 {
2408 // it's our lucky day
2409 flags = WC_NO_BEST_FIT_CHARS;
2410 pUsedDef = &usedDef;
2411 }
2412 else // old system or unsupported encoding
2413 {
2414 flags = 0;
2415 pUsedDef = NULL;
2416 }
2417
2418 const size_t len = ::WideCharToMultiByte
2419 (
2420 m_CodePage, // code page
2421 flags, // either none or no best fit
2422 pwz, // input string
2423 -1, // it is (wide) NUL-terminated
2424 buf, // output buffer
2425 buf ? n : 0, // and its size
2426 NULL, // default "replacement" char
2427 pUsedDef // [out] was it used?
2428 );
2429
2430 if ( !len )
2431 {
2432 // function totally failed
2433 return wxCONV_FAILED;
2434 }
2435
2436 // we did something, check if we really succeeded
2437 if ( flags )
2438 {
2439 // check if the conversion failed, i.e. if any replacements
2440 // were done
2441 if ( usedDef )
2442 return wxCONV_FAILED;
2443 }
2444 else // we must resort to double tripping...
2445 {
2446 // first we need to ensure that we really have the MB data: this is
2447 // not the case if we're called with NULL buffer, in which case we
2448 // need to do the conversion yet again
2449 wxCharBuffer bufDef;
2450 if ( !buf )
2451 {
2452 bufDef = wxCharBuffer(len);
2453 buf = bufDef.data();
2454 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2455 buf, len, NULL, NULL) )
2456 return wxCONV_FAILED;
2457 }
2458
2459 wxWCharBuffer wcBuf(n);
2460 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2461 wcscmp(wcBuf, pwz) != 0 )
2462 {
2463 // we didn't obtain the same thing we started from, hence
2464 // the conversion was lossy and we consider that it failed
2465 return wxCONV_FAILED;
2466 }
2467 }
2468
2469 // see the comment above for the reason of "len - 1"
2470 return len - 1;
2471 }
2472
2473 virtual size_t GetMBNulLen() const
2474 {
2475 if ( m_minMBCharWidth == 0 )
2476 {
2477 int len = ::WideCharToMultiByte
2478 (
2479 m_CodePage, // code page
2480 0, // no flags
2481 L"", // input string
2482 1, // translate just the NUL
2483 NULL, // output buffer
2484 0, // and its size
2485 NULL, // no replacement char
2486 NULL // [out] don't care if it was used
2487 );
2488
2489 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2490 switch ( len )
2491 {
2492 default:
2493 wxLogDebug(_T("Unexpected NUL length %d"), len);
2494 self->m_minMBCharWidth = (size_t)-1;
2495 break;
2496
2497 case 0:
2498 self->m_minMBCharWidth = (size_t)-1;
2499 break;
2500
2501 case 1:
2502 case 2:
2503 case 4:
2504 self->m_minMBCharWidth = len;
2505 break;
2506 }
2507 }
2508
2509 return m_minMBCharWidth;
2510 }
2511
2512 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2513
2514 bool IsOk() const { return m_CodePage != -1; }
2515
2516 private:
2517 static bool CanUseNoBestFit()
2518 {
2519 static int s_isWin98Or2k = -1;
2520
2521 if ( s_isWin98Or2k == -1 )
2522 {
2523 int verMaj, verMin;
2524 switch ( wxGetOsVersion(&verMaj, &verMin) )
2525 {
2526 case wxOS_WINDOWS_9X:
2527 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2528 break;
2529
2530 case wxOS_WINDOWS_NT:
2531 s_isWin98Or2k = verMaj >= 5;
2532 break;
2533
2534 default:
2535 // unknown: be conservative by default
2536 s_isWin98Or2k = 0;
2537 break;
2538 }
2539
2540 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2541 }
2542
2543 return s_isWin98Or2k == 1;
2544 }
2545
2546 static bool IsAtLeastWin2kSP4()
2547 {
2548 #ifdef __WXWINCE__
2549 return false;
2550 #else
2551 static int s_isAtLeastWin2kSP4 = -1;
2552
2553 if ( s_isAtLeastWin2kSP4 == -1 )
2554 {
2555 OSVERSIONINFOEX ver;
2556
2557 memset(&ver, 0, sizeof(ver));
2558 ver.dwOSVersionInfoSize = sizeof(ver);
2559 GetVersionEx((OSVERSIONINFO*)&ver);
2560
2561 s_isAtLeastWin2kSP4 =
2562 ((ver.dwMajorVersion > 5) || // Vista+
2563 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2564 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2565 ver.wServicePackMajor >= 4)) // 2000 SP4+
2566 ? 1 : 0;
2567 }
2568
2569 return s_isAtLeastWin2kSP4 == 1;
2570 #endif
2571 }
2572
2573
2574 // the code page we're working with
2575 long m_CodePage;
2576
2577 // cached result of GetMBNulLen(), set to 0 initially meaning
2578 // "unknown"
2579 size_t m_minMBCharWidth;
2580 };
2581
2582 #endif // wxHAVE_WIN32_MB2WC
2583
2584
2585 // ============================================================================
2586 // wxEncodingConverter based conversion classes
2587 // ============================================================================
2588
2589 #if wxUSE_FONTMAP
2590
2591 class wxMBConv_wxwin : public wxMBConv
2592 {
2593 private:
2594 void Init()
2595 {
2596 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2597 // The wxMBConv_cf class does a better job.
2598 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2599 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2600 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2601 }
2602
2603 public:
2604 // temporarily just use wxEncodingConverter stuff,
2605 // so that it works while a better implementation is built
2606 wxMBConv_wxwin(const char* name)
2607 {
2608 if (name)
2609 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2610 else
2611 m_enc = wxFONTENCODING_SYSTEM;
2612
2613 Init();
2614 }
2615
2616 wxMBConv_wxwin(wxFontEncoding enc)
2617 {
2618 m_enc = enc;
2619
2620 Init();
2621 }
2622
2623 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2624 {
2625 size_t inbuf = strlen(psz);
2626 if (buf)
2627 {
2628 if (!m2w.Convert(psz, buf))
2629 return wxCONV_FAILED;
2630 }
2631 return inbuf;
2632 }
2633
2634 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2635 {
2636 const size_t inbuf = wxWcslen(psz);
2637 if (buf)
2638 {
2639 if (!w2m.Convert(psz, buf))
2640 return wxCONV_FAILED;
2641 }
2642
2643 return inbuf;
2644 }
2645
2646 virtual size_t GetMBNulLen() const
2647 {
2648 switch ( m_enc )
2649 {
2650 case wxFONTENCODING_UTF16BE:
2651 case wxFONTENCODING_UTF16LE:
2652 return 2;
2653
2654 case wxFONTENCODING_UTF32BE:
2655 case wxFONTENCODING_UTF32LE:
2656 return 4;
2657
2658 default:
2659 return 1;
2660 }
2661 }
2662
2663 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2664
2665 bool IsOk() const { return m_ok; }
2666
2667 public:
2668 wxFontEncoding m_enc;
2669 wxEncodingConverter m2w, w2m;
2670
2671 private:
2672 // were we initialized successfully?
2673 bool m_ok;
2674
2675 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2676 };
2677
2678 // make the constructors available for unit testing
2679 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2680 {
2681 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2682 if ( !result->IsOk() )
2683 {
2684 delete result;
2685 return 0;
2686 }
2687
2688 return result;
2689 }
2690
2691 #endif // wxUSE_FONTMAP
2692
2693 // ============================================================================
2694 // wxCSConv implementation
2695 // ============================================================================
2696
2697 void wxCSConv::Init()
2698 {
2699 m_name = NULL;
2700 m_convReal = NULL;
2701 m_deferred = true;
2702 }
2703
2704 wxCSConv::wxCSConv(const wxString& charset)
2705 {
2706 Init();
2707
2708 if ( !charset.empty() )
2709 {
2710 SetName(charset.ToAscii());
2711 }
2712
2713 #if wxUSE_FONTMAP
2714 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2715 #else
2716 m_encoding = wxFONTENCODING_SYSTEM;
2717 #endif
2718 }
2719
2720 wxCSConv::wxCSConv(wxFontEncoding encoding)
2721 {
2722 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2723 {
2724 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2725
2726 encoding = wxFONTENCODING_SYSTEM;
2727 }
2728
2729 Init();
2730
2731 m_encoding = encoding;
2732 }
2733
2734 wxCSConv::~wxCSConv()
2735 {
2736 Clear();
2737 }
2738
2739 wxCSConv::wxCSConv(const wxCSConv& conv)
2740 : wxMBConv()
2741 {
2742 Init();
2743
2744 SetName(conv.m_name);
2745 m_encoding = conv.m_encoding;
2746 }
2747
2748 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2749 {
2750 Clear();
2751
2752 SetName(conv.m_name);
2753 m_encoding = conv.m_encoding;
2754
2755 return *this;
2756 }
2757
2758 void wxCSConv::Clear()
2759 {
2760 free(m_name);
2761 delete m_convReal;
2762
2763 m_name = NULL;
2764 m_convReal = NULL;
2765 }
2766
2767 void wxCSConv::SetName(const char *charset)
2768 {
2769 if (charset)
2770 {
2771 m_name = wxStrdup(charset);
2772 m_deferred = true;
2773 }
2774 }
2775
2776 #if wxUSE_FONTMAP
2777
// Cache used by wxCSConv::DoCreate() when iconv is available: it maps an
// encoding to the charset name for which an iconv-based converter could be
// created, an empty name meaning that this failed for all the names tried.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
2782 #endif
2783
2784 wxMBConv *wxCSConv::DoCreate() const
2785 {
2786 #if wxUSE_FONTMAP
2787 wxLogTrace(TRACE_STRCONV,
2788 wxT("creating conversion for %s"),
2789 (m_name ? m_name
2790 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2791 #endif // wxUSE_FONTMAP
2792
2793 // check for the special case of ASCII or ISO8859-1 charset: as we have
2794 // special knowledge of it anyhow, we don't need to create a special
2795 // conversion object
2796 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2797 m_encoding == wxFONTENCODING_DEFAULT )
2798 {
2799 // don't convert at all
2800 return NULL;
2801 }
2802
2803 // we trust OS to do conversion better than we can so try external
2804 // conversion methods first
2805 //
2806 // the full order is:
2807 // 1. OS conversion (iconv() under Unix or Win32 API)
2808 // 2. hard coded conversions for UTF
2809 // 3. wxEncodingConverter as fall back
2810
2811 // step (1)
2812 #ifdef HAVE_ICONV
2813 #if !wxUSE_FONTMAP
2814 if ( m_name )
2815 #endif // !wxUSE_FONTMAP
2816 {
2817 #if wxUSE_FONTMAP
2818 wxFontEncoding encoding(m_encoding);
2819 #endif
2820
2821 if ( m_name )
2822 {
2823 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2824 if ( conv->IsOk() )
2825 return conv;
2826
2827 delete conv;
2828
2829 #if wxUSE_FONTMAP
2830 encoding =
2831 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2832 #endif // wxUSE_FONTMAP
2833 }
2834 #if wxUSE_FONTMAP
2835 {
2836 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2837 if ( it != gs_nameCache.end() )
2838 {
2839 if ( it->second.empty() )
2840 return NULL;
2841
2842 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2843 if ( conv->IsOk() )
2844 return conv;
2845
2846 delete conv;
2847 }
2848
2849 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2850 // CS : in case this does not return valid names (eg for MacRoman)
2851 // encoding got a 'failure' entry in the cache all the same,
2852 // although it just has to be created using a different method, so
2853 // only store failed iconv creation attempts (or perhaps we
2854 // shoulnd't do this at all ?)
2855 if ( names[0] != NULL )
2856 {
2857 for ( ; *names; ++names )
2858 {
2859 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2860 // will need changes that will obsolete this
2861 wxString name(*names);
2862 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2863 if ( conv->IsOk() )
2864 {
2865 gs_nameCache[encoding] = *names;
2866 return conv;
2867 }
2868
2869 delete conv;
2870 }
2871
2872 gs_nameCache[encoding] = _T(""); // cache the failure
2873 }
2874 }
2875 #endif // wxUSE_FONTMAP
2876 }
2877 #endif // HAVE_ICONV
2878
2879 #ifdef wxHAVE_WIN32_MB2WC
2880 {
2881 #if wxUSE_FONTMAP
2882 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2883 : new wxMBConv_win32(m_encoding);
2884 if ( conv->IsOk() )
2885 return conv;
2886
2887 delete conv;
2888 #else
2889 return NULL;
2890 #endif
2891 }
2892 #endif // wxHAVE_WIN32_MB2WC
2893
2894 #ifdef __DARWIN__
2895 {
2896 // leave UTF16 and UTF32 to the built-ins of wx
2897 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2898 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2899 {
2900 #if wxUSE_FONTMAP
2901 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2902 : new wxMBConv_cf(m_encoding);
2903 #else
2904 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2905 #endif
2906
2907 if ( conv->IsOk() )
2908 return conv;
2909
2910 delete conv;
2911 }
2912 }
2913 #endif // __DARWIN__
2914
2915 // step (2)
2916 wxFontEncoding enc = m_encoding;
2917 #if wxUSE_FONTMAP
2918 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2919 {
2920 // use "false" to suppress interactive dialogs -- we can be called from
2921 // anywhere and popping up a dialog from here is the last thing we want to
2922 // do
2923 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2924 }
2925 #endif // wxUSE_FONTMAP
2926
2927 switch ( enc )
2928 {
2929 case wxFONTENCODING_UTF7:
2930 return new wxMBConvUTF7;
2931
2932 case wxFONTENCODING_UTF8:
2933 return new wxMBConvUTF8;
2934
2935 case wxFONTENCODING_UTF16BE:
2936 return new wxMBConvUTF16BE;
2937
2938 case wxFONTENCODING_UTF16LE:
2939 return new wxMBConvUTF16LE;
2940
2941 case wxFONTENCODING_UTF32BE:
2942 return new wxMBConvUTF32BE;
2943
2944 case wxFONTENCODING_UTF32LE:
2945 return new wxMBConvUTF32LE;
2946
2947 default:
2948 // nothing to do but put here to suppress gcc warnings
2949 break;
2950 }
2951
2952 // step (3)
2953 #if wxUSE_FONTMAP
2954 {
2955 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2956 : new wxMBConv_wxwin(m_encoding);
2957 if ( conv->IsOk() )
2958 return conv;
2959
2960 delete conv;
2961 }
2962 #endif // wxUSE_FONTMAP
2963
2964 // NB: This is a hack to prevent deadlock. What could otherwise happen
2965 // in Unicode build: wxConvLocal creation ends up being here
2966 // because of some failure and logs the error. But wxLog will try to
2967 // attach a timestamp, for which it will need wxConvLocal (to convert
2968 // time to char* and then wchar_t*), but that fails, tries to log the
2969 // error, but wxLog has an (already locked) critical section that
2970 // guards the static buffer.
2971 static bool alreadyLoggingError = false;
2972 if (!alreadyLoggingError)
2973 {
2974 alreadyLoggingError = true;
2975 wxLogError(_("Cannot convert from the charset '%s'!"),
2976 m_name ? m_name
2977 :
2978 #if wxUSE_FONTMAP
2979 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2980 #else // !wxUSE_FONTMAP
2981 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2982 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2983 );
2984
2985 alreadyLoggingError = false;
2986 }
2987
2988 return NULL;
2989 }
2990
2991 void wxCSConv::CreateConvIfNeeded() const
2992 {
2993 if ( m_deferred )
2994 {
2995 wxCSConv *self = (wxCSConv *)this; // const_cast
2996
2997 // if we don't have neither the name nor the encoding, use the default
2998 // encoding for this system
2999 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3000 {
3001 #if wxUSE_INTL
3002 self->m_encoding = wxLocale::GetSystemEncoding();
3003 #else
3004 // fallback to some reasonable default:
3005 self->m_encoding = wxFONTENCODING_ISO8859_1;
3006 #endif // wxUSE_INTL
3007 }
3008
3009 self->m_convReal = DoCreate();
3010 self->m_deferred = false;
3011 }
3012 }
3013
3014 bool wxCSConv::IsOk() const
3015 {
3016 CreateConvIfNeeded();
3017
3018 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3019 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3020 return true; // always ok as we do it ourselves
3021
3022 // m_convReal->IsOk() is called at its own creation, so we know it must
3023 // be ok if m_convReal is non-NULL
3024 return m_convReal != NULL;
3025 }
3026
3027 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3028 const char *src, size_t srcLen) const
3029 {
3030 CreateConvIfNeeded();
3031
3032 if (m_convReal)
3033 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3034
3035 // latin-1 (direct)
3036 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3037 }
3038
3039 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3040 const wchar_t *src, size_t srcLen) const
3041 {
3042 CreateConvIfNeeded();
3043
3044 if (m_convReal)
3045 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3046
3047 // latin-1 (direct)
3048 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3049 }
3050
3051 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3052 {
3053 CreateConvIfNeeded();
3054
3055 if (m_convReal)
3056 return m_convReal->MB2WC(buf, psz, n);
3057
3058 // latin-1 (direct)
3059 size_t len = strlen(psz);
3060
3061 if (buf)
3062 {
3063 for (size_t c = 0; c <= len; c++)
3064 buf[c] = (unsigned char)(psz[c]);
3065 }
3066
3067 return len;
3068 }
3069
3070 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3071 {
3072 CreateConvIfNeeded();
3073
3074 if (m_convReal)
3075 return m_convReal->WC2MB(buf, psz, n);
3076
3077 // latin-1 (direct)
3078 const size_t len = wxWcslen(psz);
3079 if (buf)
3080 {
3081 for (size_t c = 0; c <= len; c++)
3082 {
3083 if (psz[c] > 0xFF)
3084 return wxCONV_FAILED;
3085
3086 buf[c] = (char)psz[c];
3087 }
3088 }
3089 else
3090 {
3091 for (size_t c = 0; c <= len; c++)
3092 {
3093 if (psz[c] > 0xFF)
3094 return wxCONV_FAILED;
3095 }
3096 }
3097
3098 return len;
3099 }
3100
3101 size_t wxCSConv::GetMBNulLen() const
3102 {
3103 CreateConvIfNeeded();
3104
3105 if ( m_convReal )
3106 {
3107 return m_convReal->GetMBNulLen();
3108 }
3109
3110 // otherwise, we are ISO-8859-1
3111 return 1;
3112 }
3113
3114 #if wxUSE_UNICODE_UTF8
3115 bool wxCSConv::IsUTF8() const
3116 {
3117 CreateConvIfNeeded();
3118
3119 if ( m_convReal )
3120 {
3121 return m_convReal->IsUTF8();
3122 }
3123
3124 // otherwise, we are ISO-8859-1
3125 return false;
3126 }
3127 #endif
3128
3129
3130 #if wxUSE_UNICODE
3131
3132 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3133 {
3134 if ( !s )
3135 return wxWCharBuffer();
3136
3137 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3138 if ( !wbuf )
3139 wbuf = wxMBConvUTF8().cMB2WX(s);
3140 if ( !wbuf )
3141 wbuf = wxConvISO8859_1.cMB2WX(s);
3142
3143 return wbuf;
3144 }
3145
3146 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3147 {
3148 if ( !ws )
3149 return wxCharBuffer();
3150
3151 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3152 if ( !buf )
3153 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3154
3155 return buf;
3156 }
3157
3158 #endif // wxUSE_UNICODE
3159
3160 // ----------------------------------------------------------------------------
3161 // globals
3162 // ----------------------------------------------------------------------------
3163
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3170
3171 #undef wxConvLibc
3172 #undef wxConvUTF8
3173 #undef wxConvUTF7
3174 #undef wxConvLocal
3175 #undef wxConvISO8859_1
3176
// Define the global converter "name": this defines the name##Ptr variable,
// initially NULL, and the wxGet_##name##Ptr() function returning the address
// of a function-local static object of the type impl_klass constructed with
// ctor_args. The function is also called during the static initialization
// of this file.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the case when the object is of the same class as the
// pointer type used to access it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3191
3192 #ifdef __WINDOWS__
3193 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3194 #else
3195 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3196 #endif
3197
// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
3204 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3205 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3206
3207 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3208 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3209
3210 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3211 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3212
3213 #ifdef __DARWIN__
3214 // The xnu kernel always communicates file paths in decomposed UTF-8.
3215 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3216 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3217 #endif
3218
3219 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3220 #ifdef __DARWIN__
3221 &wxConvMacUTF8DObj;
3222 #else // !__DARWIN__
3223 wxGet_wxConvLibcPtr();
3224 #endif // __DARWIN__/!__DARWIN__
3225
3226 #else // !wxUSE_WCHAR_T
3227
3228 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3229 // stand-ins in absence of wchar_t
3230 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3231 wxConvISO8859_1,
3232 wxConvLocal,
3233 wxConvUTF8;
3234
3235 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T