fix wxMBConvUTF8::cMB2WC/cWC2MB() broken by the introduction of wxMBConvStrictUTF8...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef __SALFORDC__
48 #include <clib.h>
49 #endif
50
51 #ifdef HAVE_ICONV
52 #include <iconv.h>
53 #include "wx/thread.h"
54 #endif
55
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
58
59 #ifdef __DARWIN__
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
62
63
64 #define TRACE_STRCONV _T("strconv")
65
66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
67 // be 4 bytes
68 #if SIZEOF_WCHAR_T == 2
69 #define WC_UTF16
70 #endif
71
72
73 // ============================================================================
74 // implementation
75 // ============================================================================
76
77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
78 static bool NotAllNULs(const char *p, size_t n)
79 {
80 while ( n && *p++ == '\0' )
81 n--;
82
83 return n != 0;
84 }
85
86 // ----------------------------------------------------------------------------
87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
88 // ----------------------------------------------------------------------------
89
90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
91 {
92 if (input <= 0xffff)
93 {
94 if (output)
95 *output = (wxUint16) input;
96
97 return 1;
98 }
99 else if (input >= 0x110000)
100 {
101 return wxCONV_FAILED;
102 }
103 else
104 {
105 if (output)
106 {
107 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
108 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
109 }
110
111 return 2;
112 }
113 }
114
115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
116 {
117 if ((*input < 0xd800) || (*input > 0xdfff))
118 {
119 output = *input;
120 return 1;
121 }
122 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
123 {
124 output = *input;
125 return wxCONV_FAILED;
126 }
127 else
128 {
129 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
130 return 2;
131 }
132 }
133
134 #ifdef WC_UTF16
135 typedef wchar_t wxDecodeSurrogate_t;
136 #else // !WC_UTF16
137 typedef wxUint16 wxDecodeSurrogate_t;
138 #endif // WC_UTF16/!WC_UTF16
139
140 // returns the next UTF-32 character from the wchar_t buffer and advances the
141 // pointer to the character after this one
142 //
143 // if an invalid character is found, *pSrc is set to NULL, the caller must
144 // check for this
145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
146 {
147 wxUint32 out;
148 const size_t
149 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
150 if ( n == wxCONV_FAILED )
151 *pSrc = NULL;
152 else
153 *pSrc += n;
154
155 return out;
156 }
157
158 // ----------------------------------------------------------------------------
159 // wxMBConv
160 // ----------------------------------------------------------------------------
161
162 size_t
163 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
164 const char *src, size_t srcLen) const
165 {
166 // although new conversion classes are supposed to implement this function
167 // directly, the existins ones only implement the old MB2WC() and so, to
168 // avoid to have to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
172
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
175
176 // the number of NULs terminating this string
177 size_t nulLen = 0; // not really needed, but just to avoid warnings
178
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
185 if ( srcLen != wxNO_LEN )
186 {
187 // we need to know how to find the end of this string
188 nulLen = GetMBNulLen();
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
191
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
194 {
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
197 char * const p = bufTmp.data();
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
200 *s = '\0';
201
202 src = bufTmp;
203 }
204
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
211
212 for ( ;; )
213 {
214 // try to convert the current chunk
215 size_t lenChunk = MB2WC(NULL, src, 0);
216 if ( lenChunk == wxCONV_FAILED )
217 return wxCONV_FAILED;
218
219 lenChunk++; // for the L'\0' at the end of this chunk
220
221 dstWritten += lenChunk;
222
223 if ( lenChunk == 1 )
224 {
225 // nothing left in the input string, conversion succeeded
226 break;
227 }
228
229 if ( dst )
230 {
231 if ( dstWritten > dstLen )
232 return wxCONV_FAILED;
233
234 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
235 return wxCONV_FAILED;
236
237 dst += lenChunk;
238 }
239
240 if ( !srcEnd )
241 {
242 // we convert just one chunk in this case as this is the entire
243 // string anyhow
244 break;
245 }
246
247 // advance the input pointer past the end of this chunk
248 while ( NotAllNULs(src, nulLen) )
249 {
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
254 src += nulLen;
255 }
256
257 src += nulLen; // skipping over its terminator as well
258
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by inEnd
262 if ( src >= srcEnd )
263 break;
264 }
265
266 return dstWritten;
267 }
268
269 size_t
270 wxMBConv::FromWChar(char *dst, size_t dstLen,
271 const wchar_t *src, size_t srcLen) const
272 {
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten = 0;
275
276 // make a copy of the input string unless it is already properly
277 // NUL-terminated
278 //
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp;
282 if ( srcLen == wxNO_LEN )
283 {
284 srcLen = wxWcslen(src) + 1;
285 }
286 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
287 {
288 // make a copy in order to properly NUL-terminate the string
289 bufTmp = wxWCharBuffer(srcLen);
290 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
291 src = bufTmp;
292 }
293
294 const size_t lenNul = GetMBNulLen();
295 for ( const wchar_t * const srcEnd = src + srcLen;
296 src < srcEnd;
297 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
298 {
299 // try to convert the current chunk
300 size_t lenChunk = WC2MB(NULL, src, 0);
301
302 if ( lenChunk == wxCONV_FAILED )
303 return wxCONV_FAILED;
304
305 lenChunk += lenNul;
306 dstWritten += lenChunk;
307
308 if ( dst )
309 {
310 if ( dstWritten > dstLen )
311 return wxCONV_FAILED;
312
313 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dst += lenChunk;
317 }
318 }
319
320 return dstWritten;
321 }
322
323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
324 {
325 size_t rc = ToWChar(outBuff, outLen, inBuff);
326 if ( rc != wxCONV_FAILED )
327 {
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
330 rc--;
331 }
332
333 return rc;
334 }
335
336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
337 {
338 size_t rc = FromWChar(outBuff, outLen, inBuff);
339 if ( rc != wxCONV_FAILED )
340 {
341 rc -= GetMBNulLen();
342 }
343
344 return rc;
345 }
346
347 wxMBConv::~wxMBConv()
348 {
349 // nothing to do here (necessary for Darwin linking probably)
350 }
351
352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
353 {
354 if ( psz )
355 {
356 // calculate the length of the buffer needed first
357 const size_t nLen = ToWChar(NULL, 0, psz);
358 if ( nLen != wxCONV_FAILED )
359 {
360 // now do the actual conversion
361 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
362
363 // +1 for the trailing NULL
364 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
365 return buf;
366 }
367 }
368
369 return wxWCharBuffer();
370 }
371
372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
373 {
374 if ( pwz )
375 {
376 const size_t nLen = FromWChar(NULL, 0, pwz);
377 if ( nLen != wxCONV_FAILED )
378 {
379 wxCharBuffer buf(nLen - 1);
380 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
381 return buf;
382 }
383 }
384
385 return wxCharBuffer();
386 }
387
388 const wxWCharBuffer
389 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
390 {
391 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
392 if ( dstLen != wxCONV_FAILED )
393 {
394 wxWCharBuffer wbuf(dstLen - 1);
395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
396 {
397 if ( outLen )
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412 }
413
414 const wxCharBuffer
415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
416 {
417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
418 if ( dstLen != wxCONV_FAILED )
419 {
420 // special case of empty input: can't allocate 0 size buffer below as
421 // wxCharBuffer insists on NUL-terminating it
422 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
423 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
424 {
425 if ( outLen )
426 {
427 *outLen = dstLen;
428
429 const size_t nulLen = GetMBNulLen();
430 if ( dstLen >= nulLen &&
431 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
432 {
433 // in this case the output is NUL-terminated and we're not
434 // supposed to count NUL
435 *outLen -= nulLen;
436 }
437 }
438
439 return buf;
440 }
441 }
442
443 if ( outLen )
444 *outLen = 0;
445
446 return wxCharBuffer();
447 }
448
449 // ----------------------------------------------------------------------------
450 // wxMBConvLibc
451 // ----------------------------------------------------------------------------
452
453 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
454 {
455 return wxMB2WC(buf, psz, n);
456 }
457
458 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
459 {
460 return wxWC2MB(buf, psz, n);
461 }
462
463 // ----------------------------------------------------------------------------
464 // wxConvBrokenFileNames
465 // ----------------------------------------------------------------------------
466
467 #ifdef __UNIX__
468
469 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
470 {
471 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
472 wxStricmp(charset, _T("UTF8")) == 0 )
473 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
474 else
475 m_conv = new wxCSConv(charset);
476 }
477
478 #endif // __UNIX__
479
480 // ----------------------------------------------------------------------------
481 // UTF-7
482 // ----------------------------------------------------------------------------
483
484 // Implementation (C) 2004 Fredrik Roubert
485
486 //
487 // BASE64 decoding table
488 //
489 static const unsigned char utf7unb64[] =
490 {
491 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
492 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
497 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
498 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
499 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
500 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
501 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
502 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
504 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
505 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
506 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
508 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
523 };
524
525 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
526 {
527 size_t len = 0;
528
529 while ( *psz && (!buf || (len < n)) )
530 {
531 unsigned char cc = *psz++;
532 if (cc != '+')
533 {
534 // plain ASCII char
535 if (buf)
536 *buf++ = cc;
537 len++;
538 }
539 else if (*psz == '-')
540 {
541 // encoded plus sign
542 if (buf)
543 *buf++ = cc;
544 len++;
545 psz++;
546 }
547 else // start of BASE64 encoded string
548 {
549 bool lsb, ok;
550 unsigned int d, l;
551 for ( ok = lsb = false, d = 0, l = 0;
552 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
553 psz++ )
554 {
555 d <<= 6;
556 d += cc;
557 for (l += 6; l >= 8; lsb = !lsb)
558 {
559 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
560 if (lsb)
561 {
562 if (buf)
563 *buf++ |= c;
564 len ++;
565 }
566 else
567 {
568 if (buf)
569 *buf = (wchar_t)(c << 8);
570 }
571
572 ok = true;
573 }
574 }
575
576 if ( !ok )
577 {
578 // in valid UTF7 we should have valid characters after '+'
579 return wxCONV_FAILED;
580 }
581
582 if (*psz == '-')
583 psz++;
584 }
585 }
586
587 if ( buf && (len < n) )
588 *buf = '\0';
589
590 return len;
591 }
592
593 //
594 // BASE64 encoding table
595 //
596 static const unsigned char utf7enb64[] =
597 {
598 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
599 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
600 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
601 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
602 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
603 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
604 'w', 'x', 'y', 'z', '0', '1', '2', '3',
605 '4', '5', '6', '7', '8', '9', '+', '/'
606 };
607
608 //
609 // UTF-7 encoding table
610 //
611 // 0 - Set D (directly encoded characters)
612 // 1 - Set O (optional direct characters)
613 // 2 - whitespace characters (optional)
614 // 3 - special characters
615 //
616 static const unsigned char utf7encode[128] =
617 {
618 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
619 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
620 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
621 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
622 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
624 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
625 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
626 };
627
628 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
629 {
630 size_t len = 0;
631
632 while (*psz && ((!buf) || (len < n)))
633 {
634 wchar_t cc = *psz++;
635 if (cc < 0x80 && utf7encode[cc] < 1)
636 {
637 // plain ASCII char
638 if (buf)
639 *buf++ = (char)cc;
640
641 len++;
642 }
643 #ifndef WC_UTF16
644 else if (((wxUint32)cc) > 0xffff)
645 {
646 // no surrogate pair generation (yet?)
647 return wxCONV_FAILED;
648 }
649 #endif
650 else
651 {
652 if (buf)
653 *buf++ = '+';
654
655 len++;
656 if (cc != '+')
657 {
658 // BASE64 encode string
659 unsigned int lsb, d, l;
660 for (d = 0, l = 0; /*nothing*/; psz++)
661 {
662 for (lsb = 0; lsb < 2; lsb ++)
663 {
664 d <<= 8;
665 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
666
667 for (l += 8; l >= 6; )
668 {
669 l -= 6;
670 if (buf)
671 *buf++ = utf7enb64[(d >> l) % 64];
672 len++;
673 }
674 }
675
676 cc = *psz;
677 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
678 break;
679 }
680
681 if (l != 0)
682 {
683 if (buf)
684 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
685
686 len++;
687 }
688 }
689
690 if (buf)
691 *buf++ = '-';
692 len++;
693 }
694 }
695
696 if (buf && (len < n))
697 *buf = 0;
698
699 return len;
700 }
701
702 // ----------------------------------------------------------------------------
703 // UTF-8
704 // ----------------------------------------------------------------------------
705
706 static const wxUint32 utf8_max[]=
707 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
708
709 // boundaries of the private use area we use to (temporarily) remap invalid
710 // characters invalid in a UTF-8 encoded string
711 const wxUint32 wxUnicodePUA = 0x100000;
712 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
713
714 // this table gives the length of the UTF-8 encoding from its first character:
715 const unsigned char tableUtf8Lengths[256] = {
716 // single-byte sequences (ASCII):
717 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
718 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
719 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
720 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
721 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
722 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
723 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
724 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
725
726 // these are invalid:
727 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
728 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
731 0, 0, // C0,C1
732
733 // two-byte sequences:
734 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
735 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
736
737 // three-byte sequences:
738 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
739
740 // four-byte sequences:
741 4, 4, 4, 4, 4, // F0..F4
742
743 // these are invalid again (5- or 6-byte
744 // sequences and sequences for code points
745 // above U+10FFFF, as restricted by RFC 3629):
746 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
747 };
748
749 size_t
750 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
751 const char *src, size_t srcLen) const
752 {
753 wchar_t *out = dstLen ? dst : NULL;
754 size_t written = 0;
755
756 if ( srcLen == wxNO_LEN )
757 srcLen = strlen(src) + 1;
758
759 for ( const char *p = src; ; p++ )
760 {
761 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
762 {
763 // all done successfully, just add the trailing NULL if we are not
764 // using explicit length
765 if ( srcLen == wxNO_LEN )
766 {
767 if ( out )
768 {
769 if ( !dstLen )
770 break;
771
772 *out = L'\0';
773 }
774
775 written++;
776 }
777
778 return written;
779 }
780
781 if ( out && !dstLen-- )
782 break;
783
784 wxUint32 code;
785 unsigned char c = *p;
786
787 if ( c < 0x80 )
788 {
789 if ( srcLen == 0 ) // the test works for wxNO_LEN too
790 break;
791
792 if ( srcLen != wxNO_LEN )
793 srcLen--;
794
795 code = c;
796 }
797 else
798 {
799 unsigned len = tableUtf8Lengths[c];
800 if ( !len )
801 break;
802
803 if ( srcLen < len ) // the test works for wxNO_LEN too
804 break;
805
806 if ( srcLen != wxNO_LEN )
807 srcLen -= len;
808
809 // Char. number range | UTF-8 octet sequence
810 // (hexadecimal) | (binary)
811 // ----------------------+----------------------------------------
812 // 0000 0000 - 0000 007F | 0xxxxxxx
813 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
814 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
815 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
816 //
817 // Code point value is stored in bits marked with 'x',
818 // lowest-order bit of the value on the right side in the diagram
819 // above. (from RFC 3629)
820
821 // mask to extract lead byte's value ('x' bits above), by sequence
822 // length:
823 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
824
825 // mask and value of lead byte's most significant bits, by length:
826 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
827 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
828
829 len--; // it's more convenient to work with 0-based length here
830
831 // extract the lead byte's value bits:
832 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
833 break;
834
835 code = c & leadValueMask[len];
836
837 // all remaining bytes, if any, are handled in the same way
838 // regardless of sequence's length:
839 for ( ; len; --len )
840 {
841 c = *++p;
842 if ( (c & 0xC0) != 0x80 )
843 return wxCONV_FAILED;
844
845 code <<= 6;
846 code |= c & 0x3F;
847 }
848 }
849
850 #ifdef WC_UTF16
851 // cast is ok because wchar_t == wxUint16 if WC_UTF16
852 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
853 {
854 if ( out )
855 out++;
856 written++;
857 }
858 #else // !WC_UTF16
859 if ( out )
860 *out = code;
861 #endif // WC_UTF16/!WC_UTF16
862
863 if ( out )
864 out++;
865
866 written++;
867 }
868
869 return wxCONV_FAILED;
870 }
871
872 size_t
873 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
874 const wchar_t *src, size_t srcLen) const
875 {
876 char *out = dstLen ? dst : NULL;
877 size_t written = 0;
878
879 for ( const wchar_t *wp = src; ; wp++ )
880 {
881 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
882 {
883 // all done successfully, just add the trailing NULL if we are not
884 // using explicit length
885 if ( srcLen == wxNO_LEN )
886 {
887 if ( out )
888 {
889 if ( !dstLen )
890 break;
891
892 *out = '\0';
893 }
894
895 written++;
896 }
897
898 return written;
899 }
900
901
902 wxUint32 code;
903 #ifdef WC_UTF16
904 // cast is ok for WC_UTF16
905 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
906 {
907 // skip the next char too as we decoded a surrogate
908 wp++;
909 }
910 #else // wchar_t is UTF-32
911 code = *wp & 0x7fffffff;
912 #endif
913
914 unsigned len;
915 if ( code <= 0x7F )
916 {
917 len = 1;
918 if ( out )
919 {
920 if ( dstLen < len )
921 break;
922
923 out[0] = (char)code;
924 }
925 }
926 else if ( code <= 0x07FF )
927 {
928 len = 2;
929 if ( out )
930 {
931 if ( dstLen < len )
932 break;
933
934 // NB: this line takes 6 least significant bits, encodes them as
935 // 10xxxxxx and discards them so that the next byte can be encoded:
936 out[1] = 0x80 | (code & 0x3F); code >>= 6;
937 out[0] = 0xC0 | code;
938 }
939 }
940 else if ( code < 0xFFFF )
941 {
942 len = 3;
943 if ( out )
944 {
945 if ( dstLen < len )
946 break;
947
948 out[2] = 0x80 | (code & 0x3F); code >>= 6;
949 out[1] = 0x80 | (code & 0x3F); code >>= 6;
950 out[0] = 0xE0 | code;
951 }
952 }
953 else if ( code <= 0x10FFFF )
954 {
955 len = 4;
956 if ( out )
957 {
958 if ( dstLen < len )
959 break;
960
961 out[3] = 0x80 | (code & 0x3F); code >>= 6;
962 out[2] = 0x80 | (code & 0x3F); code >>= 6;
963 out[1] = 0x80 | (code & 0x3F); code >>= 6;
964 out[0] = 0xF0 | code;
965 }
966 }
967 else
968 {
969 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
970 break;
971 }
972
973 if ( out )
974 {
975 out += len;
976 dstLen -= len;
977 }
978
979 written += len;
980 }
981
982 // we only get here if an error occurs during decoding
983 return wxCONV_FAILED;
984 }
985
986 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
987 const char *psz, size_t srcLen) const
988 {
989 if ( m_options == MAP_INVALID_UTF8_NOT )
990 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
991
992 size_t len = 0;
993
994 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
995 {
996 const char *opsz = psz;
997 bool invalid = false;
998 unsigned char cc = *psz++, fc = cc;
999 unsigned cnt;
1000 for (cnt = 0; fc & 0x80; cnt++)
1001 fc <<= 1;
1002
1003 if (!cnt)
1004 {
1005 // plain ASCII char
1006 if (buf)
1007 *buf++ = cc;
1008 len++;
1009
1010 // escape the escape character for octal escapes
1011 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1012 && cc == '\\' && (!buf || len < n))
1013 {
1014 if (buf)
1015 *buf++ = cc;
1016 len++;
1017 }
1018 }
1019 else
1020 {
1021 cnt--;
1022 if (!cnt)
1023 {
1024 // invalid UTF-8 sequence
1025 invalid = true;
1026 }
1027 else
1028 {
1029 unsigned ocnt = cnt - 1;
1030 wxUint32 res = cc & (0x3f >> cnt);
1031 while (cnt--)
1032 {
1033 cc = *psz;
1034 if ((cc & 0xC0) != 0x80)
1035 {
1036 // invalid UTF-8 sequence
1037 invalid = true;
1038 break;
1039 }
1040
1041 psz++;
1042 res = (res << 6) | (cc & 0x3f);
1043 }
1044
1045 if (invalid || res <= utf8_max[ocnt])
1046 {
1047 // illegal UTF-8 encoding
1048 invalid = true;
1049 }
1050 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1051 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1052 {
1053 // if one of our PUA characters turns up externally
1054 // it must also be treated as an illegal sequence
1055 // (a bit like you have to escape an escape character)
1056 invalid = true;
1057 }
1058 else
1059 {
1060 #ifdef WC_UTF16
1061 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1062 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1063 if (pa == wxCONV_FAILED)
1064 {
1065 invalid = true;
1066 }
1067 else
1068 {
1069 if (buf)
1070 buf += pa;
1071 len += pa;
1072 }
1073 #else // !WC_UTF16
1074 if (buf)
1075 *buf++ = (wchar_t)res;
1076 len++;
1077 #endif // WC_UTF16/!WC_UTF16
1078 }
1079 }
1080
1081 if (invalid)
1082 {
1083 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1084 {
1085 while (opsz < psz && (!buf || len < n))
1086 {
1087 #ifdef WC_UTF16
1088 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1089 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1090 wxASSERT(pa != wxCONV_FAILED);
1091 if (buf)
1092 buf += pa;
1093 opsz++;
1094 len += pa;
1095 #else
1096 if (buf)
1097 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1098 opsz++;
1099 len++;
1100 #endif
1101 }
1102 }
1103 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1104 {
1105 while (opsz < psz && (!buf || len < n))
1106 {
1107 if ( buf && len + 3 < n )
1108 {
1109 unsigned char on = *opsz;
1110 *buf++ = L'\\';
1111 *buf++ = (wchar_t)( L'0' + on / 0100 );
1112 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1113 *buf++ = (wchar_t)( L'0' + on % 010 );
1114 }
1115
1116 opsz++;
1117 len += 4;
1118 }
1119 }
1120 else // MAP_INVALID_UTF8_NOT
1121 {
1122 return wxCONV_FAILED;
1123 }
1124 }
1125 }
1126 }
1127
1128 if (srcLen == wxNO_LEN && buf && (len < n))
1129 *buf = 0;
1130
1131 return len + 1;
1132 }
1133
1134 static inline bool isoctal(wchar_t wch)
1135 {
1136 return L'0' <= wch && wch <= L'7';
1137 }
1138
1139 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1140 const wchar_t *psz, size_t srcLen) const
1141 {
1142 if ( m_options == MAP_INVALID_UTF8_NOT )
1143 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1144
1145 size_t len = 0;
1146
1147 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1148 {
1149 wxUint32 cc;
1150
1151 #ifdef WC_UTF16
1152 // cast is ok for WC_UTF16
1153 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1154 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1155 #else
1156 cc = (*psz++) & 0x7fffffff;
1157 #endif
1158
1159 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1160 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1161 {
1162 if (buf)
1163 *buf++ = (char)(cc - wxUnicodePUA);
1164 len++;
1165 }
1166 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1167 && cc == L'\\' && psz[0] == L'\\' )
1168 {
1169 if (buf)
1170 *buf++ = (char)cc;
1171 psz++;
1172 len++;
1173 }
1174 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1175 cc == L'\\' &&
1176 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1177 {
1178 if (buf)
1179 {
1180 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1181 (psz[1] - L'0') * 010 +
1182 (psz[2] - L'0'));
1183 }
1184
1185 psz += 3;
1186 len++;
1187 }
1188 else
1189 {
1190 unsigned cnt;
1191 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1192 {
1193 }
1194
1195 if (!cnt)
1196 {
1197 // plain ASCII char
1198 if (buf)
1199 *buf++ = (char) cc;
1200 len++;
1201 }
1202 else
1203 {
1204 len += cnt + 1;
1205 if (buf)
1206 {
1207 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1208 while (cnt--)
1209 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1210 }
1211 }
1212 }
1213 }
1214
1215 if (srcLen == wxNO_LEN && buf && (len < n))
1216 *buf = 0;
1217
1218 return len + 1;
1219 }
1220
1221 // ============================================================================
1222 // UTF-16
1223 // ============================================================================
1224
1225 #ifdef WORDS_BIGENDIAN
1226 #define wxMBConvUTF16straight wxMBConvUTF16BE
1227 #define wxMBConvUTF16swap wxMBConvUTF16LE
1228 #else
1229 #define wxMBConvUTF16swap wxMBConvUTF16BE
1230 #define wxMBConvUTF16straight wxMBConvUTF16LE
1231 #endif
1232
1233 /* static */
1234 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1235 {
1236 if ( srcLen == wxNO_LEN )
1237 {
1238 // count the number of bytes in input, including the trailing NULs
1239 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1240 for ( srcLen = 1; *inBuff++; srcLen++ )
1241 ;
1242
1243 srcLen *= BYTES_PER_CHAR;
1244 }
1245 else // we already have the length
1246 {
1247 // we can only convert an entire number of UTF-16 characters
1248 if ( srcLen % BYTES_PER_CHAR )
1249 return wxCONV_FAILED;
1250 }
1251
1252 return srcLen;
1253 }
1254
1255 // case when in-memory representation is UTF-16 too
1256 #ifdef WC_UTF16
1257
1258 // ----------------------------------------------------------------------------
1259 // conversions without endianness change
1260 // ----------------------------------------------------------------------------
1261
1262 size_t
1263 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1264 const char *src, size_t srcLen) const
1265 {
1266 // set up the scene for using memcpy() (which is presumably more efficient
1267 // than copying the bytes one by one)
1268 srcLen = GetLength(src, srcLen);
1269 if ( srcLen == wxNO_LEN )
1270 return wxCONV_FAILED;
1271
1272 const size_t inLen = srcLen / BYTES_PER_CHAR;
1273 if ( dst )
1274 {
1275 if ( dstLen < inLen )
1276 return wxCONV_FAILED;
1277
1278 memcpy(dst, src, srcLen);
1279 }
1280
1281 return inLen;
1282 }
1283
1284 size_t
1285 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1286 const wchar_t *src, size_t srcLen) const
1287 {
1288 if ( srcLen == wxNO_LEN )
1289 srcLen = wxWcslen(src) + 1;
1290
1291 srcLen *= BYTES_PER_CHAR;
1292
1293 if ( dst )
1294 {
1295 if ( dstLen < srcLen )
1296 return wxCONV_FAILED;
1297
1298 memcpy(dst, src, srcLen);
1299 }
1300
1301 return srcLen;
1302 }
1303
1304 // ----------------------------------------------------------------------------
1305 // endian-reversing conversions
1306 // ----------------------------------------------------------------------------
1307
1308 size_t
1309 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1310 const char *src, size_t srcLen) const
1311 {
1312 srcLen = GetLength(src, srcLen);
1313 if ( srcLen == wxNO_LEN )
1314 return wxCONV_FAILED;
1315
1316 srcLen /= BYTES_PER_CHAR;
1317
1318 if ( dst )
1319 {
1320 if ( dstLen < srcLen )
1321 return wxCONV_FAILED;
1322
1323 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1324 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1325 {
1326 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1327 }
1328 }
1329
1330 return srcLen;
1331 }
1332
1333 size_t
1334 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1335 const wchar_t *src, size_t srcLen) const
1336 {
1337 if ( srcLen == wxNO_LEN )
1338 srcLen = wxWcslen(src) + 1;
1339
1340 srcLen *= BYTES_PER_CHAR;
1341
1342 if ( dst )
1343 {
1344 if ( dstLen < srcLen )
1345 return wxCONV_FAILED;
1346
1347 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1348 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1349 {
1350 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1351 }
1352 }
1353
1354 return srcLen;
1355 }
1356
1357 #else // !WC_UTF16: wchar_t is UTF-32
1358
1359 // ----------------------------------------------------------------------------
1360 // conversions without endianness change
1361 // ----------------------------------------------------------------------------
1362
1363 size_t
1364 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1365 const char *src, size_t srcLen) const
1366 {
1367 srcLen = GetLength(src, srcLen);
1368 if ( srcLen == wxNO_LEN )
1369 return wxCONV_FAILED;
1370
1371 const size_t inLen = srcLen / BYTES_PER_CHAR;
1372 if ( !dst )
1373 {
1374 // optimization: return maximal space which could be needed for this
1375 // string even if the real size could be smaller if the buffer contains
1376 // any surrogates
1377 return inLen;
1378 }
1379
1380 size_t outLen = 0;
1381 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1382 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1383 {
1384 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1385 if ( !inBuff )
1386 return wxCONV_FAILED;
1387
1388 if ( ++outLen > dstLen )
1389 return wxCONV_FAILED;
1390
1391 *dst++ = ch;
1392 }
1393
1394
1395 return outLen;
1396 }
1397
1398 size_t
1399 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1400 const wchar_t *src, size_t srcLen) const
1401 {
1402 if ( srcLen == wxNO_LEN )
1403 srcLen = wxWcslen(src) + 1;
1404
1405 size_t outLen = 0;
1406 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1407 for ( size_t n = 0; n < srcLen; n++ )
1408 {
1409 wxUint16 cc[2];
1410 const size_t numChars = encode_utf16(*src++, cc);
1411 if ( numChars == wxCONV_FAILED )
1412 return wxCONV_FAILED;
1413
1414 outLen += numChars * BYTES_PER_CHAR;
1415 if ( outBuff )
1416 {
1417 if ( outLen > dstLen )
1418 return wxCONV_FAILED;
1419
1420 *outBuff++ = cc[0];
1421 if ( numChars == 2 )
1422 {
1423 // second character of a surrogate
1424 *outBuff++ = cc[1];
1425 }
1426 }
1427 }
1428
1429 return outLen;
1430 }
1431
1432 // ----------------------------------------------------------------------------
1433 // endian-reversing conversions
1434 // ----------------------------------------------------------------------------
1435
1436 size_t
1437 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1438 const char *src, size_t srcLen) const
1439 {
1440 srcLen = GetLength(src, srcLen);
1441 if ( srcLen == wxNO_LEN )
1442 return wxCONV_FAILED;
1443
1444 const size_t inLen = srcLen / BYTES_PER_CHAR;
1445 if ( !dst )
1446 {
1447 // optimization: return maximal space which could be needed for this
1448 // string even if the real size could be smaller if the buffer contains
1449 // any surrogates
1450 return inLen;
1451 }
1452
1453 size_t outLen = 0;
1454 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1455 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1456 {
1457 wxUint32 ch;
1458 wxUint16 tmp[2];
1459
1460 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1461 inBuff++;
1462 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1463
1464 const size_t numChars = decode_utf16(tmp, ch);
1465 if ( numChars == wxCONV_FAILED )
1466 return wxCONV_FAILED;
1467
1468 if ( numChars == 2 )
1469 inBuff++;
1470
1471 if ( ++outLen > dstLen )
1472 return wxCONV_FAILED;
1473
1474 *dst++ = ch;
1475 }
1476
1477
1478 return outLen;
1479 }
1480
1481 size_t
1482 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1483 const wchar_t *src, size_t srcLen) const
1484 {
1485 if ( srcLen == wxNO_LEN )
1486 srcLen = wxWcslen(src) + 1;
1487
1488 size_t outLen = 0;
1489 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1490 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1491 {
1492 wxUint16 cc[2];
1493 const size_t numChars = encode_utf16(*src, cc);
1494 if ( numChars == wxCONV_FAILED )
1495 return wxCONV_FAILED;
1496
1497 outLen += numChars * BYTES_PER_CHAR;
1498 if ( outBuff )
1499 {
1500 if ( outLen > dstLen )
1501 return wxCONV_FAILED;
1502
1503 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1504 if ( numChars == 2 )
1505 {
1506 // second character of a surrogate
1507 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1508 }
1509 }
1510 }
1511
1512 return outLen;
1513 }
1514
1515 #endif // WC_UTF16/!WC_UTF16
1516
1517
1518 // ============================================================================
1519 // UTF-32
1520 // ============================================================================
1521
1522 #ifdef WORDS_BIGENDIAN
1523 #define wxMBConvUTF32straight wxMBConvUTF32BE
1524 #define wxMBConvUTF32swap wxMBConvUTF32LE
1525 #else
1526 #define wxMBConvUTF32swap wxMBConvUTF32BE
1527 #define wxMBConvUTF32straight wxMBConvUTF32LE
1528 #endif
1529
1530
1531 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1532 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1533
1534 /* static */
1535 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1536 {
1537 if ( srcLen == wxNO_LEN )
1538 {
1539 // count the number of bytes in input, including the trailing NULs
1540 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1541 for ( srcLen = 1; *inBuff++; srcLen++ )
1542 ;
1543
1544 srcLen *= BYTES_PER_CHAR;
1545 }
1546 else // we already have the length
1547 {
1548 // we can only convert an entire number of UTF-32 characters
1549 if ( srcLen % BYTES_PER_CHAR )
1550 return wxCONV_FAILED;
1551 }
1552
1553 return srcLen;
1554 }
1555
1556 // case when in-memory representation is UTF-16
1557 #ifdef WC_UTF16
1558
1559 // ----------------------------------------------------------------------------
1560 // conversions without endianness change
1561 // ----------------------------------------------------------------------------
1562
1563 size_t
1564 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1565 const char *src, size_t srcLen) const
1566 {
1567 srcLen = GetLength(src, srcLen);
1568 if ( srcLen == wxNO_LEN )
1569 return wxCONV_FAILED;
1570
1571 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1572 const size_t inLen = srcLen / BYTES_PER_CHAR;
1573 size_t outLen = 0;
1574 for ( size_t n = 0; n < inLen; n++ )
1575 {
1576 wxUint16 cc[2];
1577 const size_t numChars = encode_utf16(*inBuff++, cc);
1578 if ( numChars == wxCONV_FAILED )
1579 return wxCONV_FAILED;
1580
1581 outLen += numChars;
1582 if ( dst )
1583 {
1584 if ( outLen > dstLen )
1585 return wxCONV_FAILED;
1586
1587 *dst++ = cc[0];
1588 if ( numChars == 2 )
1589 {
1590 // second character of a surrogate
1591 *dst++ = cc[1];
1592 }
1593 }
1594 }
1595
1596 return outLen;
1597 }
1598
1599 size_t
1600 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1601 const wchar_t *src, size_t srcLen) const
1602 {
1603 if ( srcLen == wxNO_LEN )
1604 srcLen = wxWcslen(src) + 1;
1605
1606 if ( !dst )
1607 {
1608 // optimization: return maximal space which could be needed for this
1609 // string instead of the exact amount which could be less if there are
1610 // any surrogates in the input
1611 //
1612 // we consider that surrogates are rare enough to make it worthwhile to
1613 // avoid running the loop below at the cost of slightly extra memory
1614 // consumption
1615 return srcLen * BYTES_PER_CHAR;
1616 }
1617
1618 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1619 size_t outLen = 0;
1620 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1621 {
1622 const wxUint32 ch = wxDecodeSurrogate(&src);
1623 if ( !src )
1624 return wxCONV_FAILED;
1625
1626 outLen += BYTES_PER_CHAR;
1627
1628 if ( outLen > dstLen )
1629 return wxCONV_FAILED;
1630
1631 *outBuff++ = ch;
1632 }
1633
1634 return outLen;
1635 }
1636
1637 // ----------------------------------------------------------------------------
1638 // endian-reversing conversions
1639 // ----------------------------------------------------------------------------
1640
1641 size_t
1642 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1643 const char *src, size_t srcLen) const
1644 {
1645 srcLen = GetLength(src, srcLen);
1646 if ( srcLen == wxNO_LEN )
1647 return wxCONV_FAILED;
1648
1649 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1650 const size_t inLen = srcLen / BYTES_PER_CHAR;
1651 size_t outLen = 0;
1652 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1653 {
1654 wxUint16 cc[2];
1655 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1656 if ( numChars == wxCONV_FAILED )
1657 return wxCONV_FAILED;
1658
1659 outLen += numChars;
1660 if ( dst )
1661 {
1662 if ( outLen > dstLen )
1663 return wxCONV_FAILED;
1664
1665 *dst++ = cc[0];
1666 if ( numChars == 2 )
1667 {
1668 // second character of a surrogate
1669 *dst++ = cc[1];
1670 }
1671 }
1672 }
1673
1674 return outLen;
1675 }
1676
1677 size_t
1678 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1679 const wchar_t *src, size_t srcLen) const
1680 {
1681 if ( srcLen == wxNO_LEN )
1682 srcLen = wxWcslen(src) + 1;
1683
1684 if ( !dst )
1685 {
1686 // optimization: return maximal space which could be needed for this
1687 // string instead of the exact amount which could be less if there are
1688 // any surrogates in the input
1689 //
1690 // we consider that surrogates are rare enough to make it worthwhile to
1691 // avoid running the loop below at the cost of slightly extra memory
1692 // consumption
1693 return srcLen*BYTES_PER_CHAR;
1694 }
1695
1696 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1697 size_t outLen = 0;
1698 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1699 {
1700 const wxUint32 ch = wxDecodeSurrogate(&src);
1701 if ( !src )
1702 return wxCONV_FAILED;
1703
1704 outLen += BYTES_PER_CHAR;
1705
1706 if ( outLen > dstLen )
1707 return wxCONV_FAILED;
1708
1709 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1710 }
1711
1712 return outLen;
1713 }
1714
1715 #else // !WC_UTF16: wchar_t is UTF-32
1716
1717 // ----------------------------------------------------------------------------
1718 // conversions without endianness change
1719 // ----------------------------------------------------------------------------
1720
1721 size_t
1722 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1723 const char *src, size_t srcLen) const
1724 {
1725 // use memcpy() as it should be much faster than hand-written loop
1726 srcLen = GetLength(src, srcLen);
1727 if ( srcLen == wxNO_LEN )
1728 return wxCONV_FAILED;
1729
1730 const size_t inLen = srcLen/BYTES_PER_CHAR;
1731 if ( dst )
1732 {
1733 if ( dstLen < inLen )
1734 return wxCONV_FAILED;
1735
1736 memcpy(dst, src, srcLen);
1737 }
1738
1739 return inLen;
1740 }
1741
1742 size_t
1743 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1744 const wchar_t *src, size_t srcLen) const
1745 {
1746 if ( srcLen == wxNO_LEN )
1747 srcLen = wxWcslen(src) + 1;
1748
1749 srcLen *= BYTES_PER_CHAR;
1750
1751 if ( dst )
1752 {
1753 if ( dstLen < srcLen )
1754 return wxCONV_FAILED;
1755
1756 memcpy(dst, src, srcLen);
1757 }
1758
1759 return srcLen;
1760 }
1761
1762 // ----------------------------------------------------------------------------
1763 // endian-reversing conversions
1764 // ----------------------------------------------------------------------------
1765
1766 size_t
1767 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1768 const char *src, size_t srcLen) const
1769 {
1770 srcLen = GetLength(src, srcLen);
1771 if ( srcLen == wxNO_LEN )
1772 return wxCONV_FAILED;
1773
1774 srcLen /= BYTES_PER_CHAR;
1775
1776 if ( dst )
1777 {
1778 if ( dstLen < srcLen )
1779 return wxCONV_FAILED;
1780
1781 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1782 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1783 {
1784 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1785 }
1786 }
1787
1788 return srcLen;
1789 }
1790
1791 size_t
1792 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1793 const wchar_t *src, size_t srcLen) const
1794 {
1795 if ( srcLen == wxNO_LEN )
1796 srcLen = wxWcslen(src) + 1;
1797
1798 srcLen *= BYTES_PER_CHAR;
1799
1800 if ( dst )
1801 {
1802 if ( dstLen < srcLen )
1803 return wxCONV_FAILED;
1804
1805 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1806 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1807 {
1808 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1809 }
1810 }
1811
1812 return srcLen;
1813 }
1814
1815 #endif // WC_UTF16/!WC_UTF16
1816
1817
1818 // ============================================================================
1819 // The classes doing conversion using the iconv_xxx() functions
1820 // ============================================================================
1821
1822 #ifdef HAVE_ICONV
1823
1824 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1825 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1826 // (unless there's yet another bug in glibc) the only case when iconv()
1827 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1828 // left in the input buffer -- when _real_ error occurs,
1829 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1830 // iconv() failure.
1831 // [This bug does not appear in glibc 2.2.]
1832 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1833 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1834 (errno != E2BIG || bufLeft != 0))
1835 #else
1836 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1837 #endif
1838
1839 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1840
1841 #define ICONV_T_INVALID ((iconv_t)-1)
1842
1843 #if SIZEOF_WCHAR_T == 4
1844 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1845 #define WC_ENC wxFONTENCODING_UTF32
1846 #elif SIZEOF_WCHAR_T == 2
1847 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1848 #define WC_ENC wxFONTENCODING_UTF16
1849 #else // sizeof(wchar_t) != 2 nor 4
1850 // does this ever happen?
1851 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1852 #endif
1853
1854 // ----------------------------------------------------------------------------
1855 // wxMBConv_iconv: encapsulates an iconv character set
1856 // ----------------------------------------------------------------------------
1857
1858 class wxMBConv_iconv : public wxMBConv
1859 {
1860 public:
1861 wxMBConv_iconv(const char *name);
1862 virtual ~wxMBConv_iconv();
1863
1864 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1865 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1866
1867 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1868 virtual size_t GetMBNulLen() const;
1869
1870 #if wxUSE_UNICODE_UTF8
1871 virtual bool IsUTF8() const;
1872 #endif
1873
1874 virtual wxMBConv *Clone() const
1875 {
1876 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1877 p->m_minMBCharWidth = m_minMBCharWidth;
1878 return p;
1879 }
1880
1881 bool IsOk() const
1882 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1883
1884 protected:
1885 // the iconv handlers used to translate from multibyte
1886 // to wide char and in the other direction
1887 iconv_t m2w,
1888 w2m;
1889
1890 #if wxUSE_THREADS
1891 // guards access to m2w and w2m objects
1892 wxMutex m_iconvMutex;
1893 #endif
1894
1895 private:
1896 // the name (for iconv_open()) of a wide char charset -- if none is
1897 // available on this machine, it will remain NULL
1898 static wxString ms_wcCharsetName;
1899
1900 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1901 // different endian-ness than the native one
1902 static bool ms_wcNeedsSwap;
1903
1904
1905 // name of the encoding handled by this conversion
1906 wxString m_name;
1907
1908 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1909 // initially
1910 size_t m_minMBCharWidth;
1911 };
1912
1913 // make the constructor available for unit testing
1914 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1915 {
1916 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1917 if ( !result->IsOk() )
1918 {
1919 delete result;
1920 return 0;
1921 }
1922
1923 return result;
1924 }
1925
1926 wxString wxMBConv_iconv::ms_wcCharsetName;
1927 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1928
1929 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1930 : m_name(name)
1931 {
1932 m_minMBCharWidth = 0;
1933
1934 // check for charset that represents wchar_t:
1935 if ( ms_wcCharsetName.empty() )
1936 {
1937 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1938
1939 #if wxUSE_FONTMAP
1940 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1941 #else // !wxUSE_FONTMAP
1942 static const wxChar *names_static[] =
1943 {
1944 #if SIZEOF_WCHAR_T == 4
1945 _T("UCS-4"),
1946 #elif SIZEOF_WCHAR_T = 2
1947 _T("UCS-2"),
1948 #endif
1949 NULL
1950 };
1951 const wxChar **names = names_static;
1952 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1953
1954 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1955 {
1956 const wxString nameCS(*names);
1957
1958 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1959 wxString nameXE(nameCS);
1960
1961 #ifdef WORDS_BIGENDIAN
1962 nameXE += _T("BE");
1963 #else // little endian
1964 nameXE += _T("LE");
1965 #endif
1966
1967 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1968 nameXE.c_str());
1969
1970 m2w = iconv_open(nameXE.ToAscii(), name);
1971 if ( m2w == ICONV_T_INVALID )
1972 {
1973 // try charset w/o bytesex info (e.g. "UCS4")
1974 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1975 nameCS.c_str());
1976 m2w = iconv_open(nameCS.ToAscii(), name);
1977
1978 // and check for bytesex ourselves:
1979 if ( m2w != ICONV_T_INVALID )
1980 {
1981 char buf[2], *bufPtr;
1982 wchar_t wbuf[2], *wbufPtr;
1983 size_t insz, outsz;
1984 size_t res;
1985
1986 buf[0] = 'A';
1987 buf[1] = 0;
1988 wbuf[0] = 0;
1989 insz = 2;
1990 outsz = SIZEOF_WCHAR_T * 2;
1991 wbufPtr = wbuf;
1992 bufPtr = buf;
1993
1994 res = iconv(
1995 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1996 (char**)&wbufPtr, &outsz);
1997
1998 if (ICONV_FAILED(res, insz))
1999 {
2000 wxLogLastError(wxT("iconv"));
2001 wxLogError(_("Conversion to charset '%s' doesn't work."),
2002 nameCS.c_str());
2003 }
2004 else // ok, can convert to this encoding, remember it
2005 {
2006 ms_wcCharsetName = nameCS;
2007 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2008 }
2009 }
2010 }
2011 else // use charset not requiring byte swapping
2012 {
2013 ms_wcCharsetName = nameXE;
2014 }
2015 }
2016
2017 wxLogTrace(TRACE_STRCONV,
2018 wxT("iconv wchar_t charset is \"%s\"%s"),
2019 ms_wcCharsetName.empty() ? wxString("<none>")
2020 : ms_wcCharsetName,
2021 ms_wcNeedsSwap ? _T(" (needs swap)")
2022 : _T(""));
2023 }
2024 else // we already have ms_wcCharsetName
2025 {
2026 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2027 }
2028
2029 if ( ms_wcCharsetName.empty() )
2030 {
2031 w2m = ICONV_T_INVALID;
2032 }
2033 else
2034 {
2035 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2036 if ( w2m == ICONV_T_INVALID )
2037 {
2038 wxLogTrace(TRACE_STRCONV,
2039 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2040 ms_wcCharsetName.c_str(), name);
2041 }
2042 }
2043 }
2044
2045 wxMBConv_iconv::~wxMBConv_iconv()
2046 {
2047 if ( m2w != ICONV_T_INVALID )
2048 iconv_close(m2w);
2049 if ( w2m != ICONV_T_INVALID )
2050 iconv_close(w2m);
2051 }
2052
2053 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2054 {
2055 // find the string length: notice that must be done differently for
2056 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2057 size_t inbuf;
2058 const size_t nulLen = GetMBNulLen();
2059 switch ( nulLen )
2060 {
2061 default:
2062 return wxCONV_FAILED;
2063
2064 case 1:
2065 inbuf = strlen(psz); // arguably more optimized than our version
2066 break;
2067
2068 case 2:
2069 case 4:
2070 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2071 // they also have to start at character boundary and not span two
2072 // adjacent characters
2073 const char *p;
2074 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2075 ;
2076 inbuf = p - psz;
2077 break;
2078 }
2079
2080 #if wxUSE_THREADS
2081 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2082 // Unfortunately there are a couple of global wxCSConv objects such as
2083 // wxConvLocal that are used all over wx code, so we have to make sure
2084 // the handle is used by at most one thread at the time. Otherwise
2085 // only a few wx classes would be safe to use from non-main threads
2086 // as MB<->WC conversion would fail "randomly".
2087 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2088 #endif // wxUSE_THREADS
2089
2090 size_t outbuf = n * SIZEOF_WCHAR_T;
2091 size_t res, cres;
2092 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2093 wchar_t *bufPtr = buf;
2094 const char *pszPtr = psz;
2095
2096 if (buf)
2097 {
2098 // have destination buffer, convert there
2099 cres = iconv(m2w,
2100 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2101 (char**)&bufPtr, &outbuf);
2102 res = n - (outbuf / SIZEOF_WCHAR_T);
2103
2104 if (ms_wcNeedsSwap)
2105 {
2106 // convert to native endianness
2107 for ( unsigned i = 0; i < res; i++ )
2108 buf[n] = WC_BSWAP(buf[i]);
2109 }
2110
2111 // NUL-terminate the string if there is any space left
2112 if (res < n)
2113 buf[res] = 0;
2114 }
2115 else
2116 {
2117 // no destination buffer... convert using temp buffer
2118 // to calculate destination buffer requirement
2119 wchar_t tbuf[8];
2120 res = 0;
2121
2122 do
2123 {
2124 bufPtr = tbuf;
2125 outbuf = 8 * SIZEOF_WCHAR_T;
2126
2127 cres = iconv(m2w,
2128 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2129 (char**)&bufPtr, &outbuf );
2130
2131 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2132 }
2133 while ((cres == (size_t)-1) && (errno == E2BIG));
2134 }
2135
2136 if (ICONV_FAILED(cres, inbuf))
2137 {
2138 //VS: it is ok if iconv fails, hence trace only
2139 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2140 return wxCONV_FAILED;
2141 }
2142
2143 return res;
2144 }
2145
2146 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2147 {
2148 #if wxUSE_THREADS
2149 // NB: explained in MB2WC
2150 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2151 #endif
2152
2153 size_t inlen = wxWcslen(psz);
2154 size_t inbuf = inlen * SIZEOF_WCHAR_T;
2155 size_t outbuf = n;
2156 size_t res, cres;
2157
2158 wchar_t *tmpbuf = 0;
2159
2160 if (ms_wcNeedsSwap)
2161 {
2162 // need to copy to temp buffer to switch endianness
2163 // (doing WC_BSWAP twice on the original buffer won't help, as it
2164 // could be in read-only memory, or be accessed in some other thread)
2165 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2166 for ( size_t i = 0; i < inlen; i++ )
2167 tmpbuf[n] = WC_BSWAP(psz[i]);
2168
2169 tmpbuf[inlen] = L'\0';
2170 psz = tmpbuf;
2171 }
2172
2173 if (buf)
2174 {
2175 // have destination buffer, convert there
2176 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2177
2178 res = n - outbuf;
2179
2180 // NB: iconv was given only wcslen(psz) characters on input, and so
2181 // it couldn't convert the trailing zero. Let's do it ourselves
2182 // if there's some room left for it in the output buffer.
2183 if (res < n)
2184 buf[0] = 0;
2185 }
2186 else
2187 {
2188 // no destination buffer: convert using temp buffer
2189 // to calculate destination buffer requirement
2190 char tbuf[16];
2191 res = 0;
2192 do
2193 {
2194 buf = tbuf;
2195 outbuf = 16;
2196
2197 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2198
2199 res += 16 - outbuf;
2200 }
2201 while ((cres == (size_t)-1) && (errno == E2BIG));
2202 }
2203
2204 if (ms_wcNeedsSwap)
2205 {
2206 free(tmpbuf);
2207 }
2208
2209 if (ICONV_FAILED(cres, inbuf))
2210 {
2211 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2212 return wxCONV_FAILED;
2213 }
2214
2215 return res;
2216 }
2217
2218 size_t wxMBConv_iconv::GetMBNulLen() const
2219 {
2220 if ( m_minMBCharWidth == 0 )
2221 {
2222 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2223
2224 #if wxUSE_THREADS
2225 // NB: explained in MB2WC
2226 wxMutexLocker lock(self->m_iconvMutex);
2227 #endif
2228
2229 const wchar_t *wnul = L"";
2230 char buf[8]; // should be enough for NUL in any encoding
2231 size_t inLen = sizeof(wchar_t),
2232 outLen = WXSIZEOF(buf);
2233 char *inBuff = (char *)wnul;
2234 char *outBuff = buf;
2235 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2236 {
2237 self->m_minMBCharWidth = (size_t)-1;
2238 }
2239 else // ok
2240 {
2241 self->m_minMBCharWidth = outBuff - buf;
2242 }
2243 }
2244
2245 return m_minMBCharWidth;
2246 }
2247
2248 #if wxUSE_UNICODE_UTF8
2249 bool wxMBConv_iconv::IsUTF8() const
2250 {
2251 return wxStricmp(m_name, "UTF-8") == 0 ||
2252 wxStricmp(m_name, "UTF8") == 0;
2253 }
2254 #endif
2255
2256 #endif // HAVE_ICONV
2257
2258
2259 // ============================================================================
2260 // Win32 conversion classes
2261 // ============================================================================
2262
2263 #ifdef wxHAVE_WIN32_MB2WC
2264
2265 // from utils.cpp
2266 #if wxUSE_FONTMAP
2267 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2268 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2269 #endif
2270
2271 class wxMBConv_win32 : public wxMBConv
2272 {
2273 public:
2274 wxMBConv_win32()
2275 {
2276 m_CodePage = CP_ACP;
2277 m_minMBCharWidth = 0;
2278 }
2279
2280 wxMBConv_win32(const wxMBConv_win32& conv)
2281 : wxMBConv()
2282 {
2283 m_CodePage = conv.m_CodePage;
2284 m_minMBCharWidth = conv.m_minMBCharWidth;
2285 }
2286
2287 #if wxUSE_FONTMAP
2288 wxMBConv_win32(const char* name)
2289 {
2290 m_CodePage = wxCharsetToCodepage(name);
2291 m_minMBCharWidth = 0;
2292 }
2293
2294 wxMBConv_win32(wxFontEncoding encoding)
2295 {
2296 m_CodePage = wxEncodingToCodepage(encoding);
2297 m_minMBCharWidth = 0;
2298 }
2299 #endif // wxUSE_FONTMAP
2300
2301 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2302 {
2303 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2304 // the behaviour is not compatible with the Unix version (using iconv)
2305 // and break the library itself, e.g. wxTextInputStream::NextChar()
2306 // wouldn't work if reading an incomplete MB char didn't result in an
2307 // error
2308 //
2309 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2310 // Win XP or newer and it is not supported for UTF-[78] so we always
2311 // use our own conversions in this case. See
2312 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2313 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2314 if ( m_CodePage == CP_UTF8 )
2315 {
2316 return wxMBConvUTF8().MB2WC(buf, psz, n);
2317 }
2318
2319 if ( m_CodePage == CP_UTF7 )
2320 {
2321 return wxMBConvUTF7().MB2WC(buf, psz, n);
2322 }
2323
2324 int flags = 0;
2325 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2326 IsAtLeastWin2kSP4() )
2327 {
2328 flags = MB_ERR_INVALID_CHARS;
2329 }
2330
2331 const size_t len = ::MultiByteToWideChar
2332 (
2333 m_CodePage, // code page
2334 flags, // flags: fall on error
2335 psz, // input string
2336 -1, // its length (NUL-terminated)
2337 buf, // output string
2338 buf ? n : 0 // size of output buffer
2339 );
2340 if ( !len )
2341 {
2342 // function totally failed
2343 return wxCONV_FAILED;
2344 }
2345
2346 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2347 // check if we succeeded, by doing a double trip:
2348 if ( !flags && buf )
2349 {
2350 const size_t mbLen = strlen(psz);
2351 wxCharBuffer mbBuf(mbLen);
2352 if ( ::WideCharToMultiByte
2353 (
2354 m_CodePage,
2355 0,
2356 buf,
2357 -1,
2358 mbBuf.data(),
2359 mbLen + 1, // size in bytes, not length
2360 NULL,
2361 NULL
2362 ) == 0 ||
2363 strcmp(mbBuf, psz) != 0 )
2364 {
2365 // we didn't obtain the same thing we started from, hence
2366 // the conversion was lossy and we consider that it failed
2367 return wxCONV_FAILED;
2368 }
2369 }
2370
2371 // note that it returns count of written chars for buf != NULL and size
2372 // of the needed buffer for buf == NULL so in either case the length of
2373 // the string (which never includes the terminating NUL) is one less
2374 return len - 1;
2375 }
2376
2377 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2378 {
2379 /*
2380 we have a problem here: by default, WideCharToMultiByte() may
2381 replace characters unrepresentable in the target code page with bad
2382 quality approximations such as turning "1/2" symbol (U+00BD) into
2383 "1" for the code pages which don't have it and we, obviously, want
2384 to avoid this at any price
2385
2386 the trouble is that this function does it _silently_, i.e. it won't
2387 even tell us whether it did or not... Win98/2000 and higher provide
2388 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2389 we have to resort to a round trip, i.e. check that converting back
2390 results in the same string -- this is, of course, expensive but
2391 otherwise we simply can't be sure to not garble the data.
2392 */
2393
2394 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2395 // it doesn't work with CJK encodings (which we test for rather roughly
2396 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2397 // supporting it
2398 BOOL usedDef wxDUMMY_INITIALIZE(false);
2399 BOOL *pUsedDef;
2400 int flags;
2401 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2402 {
2403 // it's our lucky day
2404 flags = WC_NO_BEST_FIT_CHARS;
2405 pUsedDef = &usedDef;
2406 }
2407 else // old system or unsupported encoding
2408 {
2409 flags = 0;
2410 pUsedDef = NULL;
2411 }
2412
2413 const size_t len = ::WideCharToMultiByte
2414 (
2415 m_CodePage, // code page
2416 flags, // either none or no best fit
2417 pwz, // input string
2418 -1, // it is (wide) NUL-terminated
2419 buf, // output buffer
2420 buf ? n : 0, // and its size
2421 NULL, // default "replacement" char
2422 pUsedDef // [out] was it used?
2423 );
2424
2425 if ( !len )
2426 {
2427 // function totally failed
2428 return wxCONV_FAILED;
2429 }
2430
2431 // we did something, check if we really succeeded
2432 if ( flags )
2433 {
2434 // check if the conversion failed, i.e. if any replacements
2435 // were done
2436 if ( usedDef )
2437 return wxCONV_FAILED;
2438 }
2439 else // we must resort to double tripping...
2440 {
2441 // first we need to ensure that we really have the MB data: this is
2442 // not the case if we're called with NULL buffer, in which case we
2443 // need to do the conversion yet again
2444 wxCharBuffer bufDef;
2445 if ( !buf )
2446 {
2447 bufDef = wxCharBuffer(len);
2448 buf = bufDef.data();
2449 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2450 buf, len, NULL, NULL) )
2451 return wxCONV_FAILED;
2452 }
2453
2454 wxWCharBuffer wcBuf(n);
2455 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2456 wcscmp(wcBuf, pwz) != 0 )
2457 {
2458 // we didn't obtain the same thing we started from, hence
2459 // the conversion was lossy and we consider that it failed
2460 return wxCONV_FAILED;
2461 }
2462 }
2463
2464 // see the comment above for the reason of "len - 1"
2465 return len - 1;
2466 }
2467
2468 virtual size_t GetMBNulLen() const
2469 {
2470 if ( m_minMBCharWidth == 0 )
2471 {
2472 int len = ::WideCharToMultiByte
2473 (
2474 m_CodePage, // code page
2475 0, // no flags
2476 L"", // input string
2477 1, // translate just the NUL
2478 NULL, // output buffer
2479 0, // and its size
2480 NULL, // no replacement char
2481 NULL // [out] don't care if it was used
2482 );
2483
2484 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2485 switch ( len )
2486 {
2487 default:
2488 wxLogDebug(_T("Unexpected NUL length %d"), len);
2489 self->m_minMBCharWidth = (size_t)-1;
2490 break;
2491
2492 case 0:
2493 self->m_minMBCharWidth = (size_t)-1;
2494 break;
2495
2496 case 1:
2497 case 2:
2498 case 4:
2499 self->m_minMBCharWidth = len;
2500 break;
2501 }
2502 }
2503
2504 return m_minMBCharWidth;
2505 }
2506
2507 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2508
2509 bool IsOk() const { return m_CodePage != -1; }
2510
2511 private:
2512 static bool CanUseNoBestFit()
2513 {
2514 static int s_isWin98Or2k = -1;
2515
2516 if ( s_isWin98Or2k == -1 )
2517 {
2518 int verMaj, verMin;
2519 switch ( wxGetOsVersion(&verMaj, &verMin) )
2520 {
2521 case wxOS_WINDOWS_9X:
2522 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2523 break;
2524
2525 case wxOS_WINDOWS_NT:
2526 s_isWin98Or2k = verMaj >= 5;
2527 break;
2528
2529 default:
2530 // unknown: be conservative by default
2531 s_isWin98Or2k = 0;
2532 break;
2533 }
2534
2535 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2536 }
2537
2538 return s_isWin98Or2k == 1;
2539 }
2540
2541 static bool IsAtLeastWin2kSP4()
2542 {
2543 #ifdef __WXWINCE__
2544 return false;
2545 #else
2546 static int s_isAtLeastWin2kSP4 = -1;
2547
2548 if ( s_isAtLeastWin2kSP4 == -1 )
2549 {
2550 OSVERSIONINFOEX ver;
2551
2552 memset(&ver, 0, sizeof(ver));
2553 ver.dwOSVersionInfoSize = sizeof(ver);
2554 GetVersionEx((OSVERSIONINFO*)&ver);
2555
2556 s_isAtLeastWin2kSP4 =
2557 ((ver.dwMajorVersion > 5) || // Vista+
2558 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2559 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2560 ver.wServicePackMajor >= 4)) // 2000 SP4+
2561 ? 1 : 0;
2562 }
2563
2564 return s_isAtLeastWin2kSP4 == 1;
2565 #endif
2566 }
2567
2568
2569 // the code page we're working with
2570 long m_CodePage;
2571
2572 // cached result of GetMBNulLen(), set to 0 initially meaning
2573 // "unknown"
2574 size_t m_minMBCharWidth;
2575 };
2576
2577 #endif // wxHAVE_WIN32_MB2WC
2578
2579
2580 // ============================================================================
2581 // wxEncodingConverter based conversion classes
2582 // ============================================================================
2583
2584 #if wxUSE_FONTMAP
2585
2586 class wxMBConv_wxwin : public wxMBConv
2587 {
2588 private:
2589 void Init()
2590 {
2591 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2592 // The wxMBConv_cf class does a better job.
2593 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2594 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2595 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2596 }
2597
2598 public:
2599 // temporarily just use wxEncodingConverter stuff,
2600 // so that it works while a better implementation is built
2601 wxMBConv_wxwin(const char* name)
2602 {
2603 if (name)
2604 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2605 else
2606 m_enc = wxFONTENCODING_SYSTEM;
2607
2608 Init();
2609 }
2610
2611 wxMBConv_wxwin(wxFontEncoding enc)
2612 {
2613 m_enc = enc;
2614
2615 Init();
2616 }
2617
2618 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2619 {
2620 size_t inbuf = strlen(psz);
2621 if (buf)
2622 {
2623 if (!m2w.Convert(psz, buf))
2624 return wxCONV_FAILED;
2625 }
2626 return inbuf;
2627 }
2628
2629 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2630 {
2631 const size_t inbuf = wxWcslen(psz);
2632 if (buf)
2633 {
2634 if (!w2m.Convert(psz, buf))
2635 return wxCONV_FAILED;
2636 }
2637
2638 return inbuf;
2639 }
2640
2641 virtual size_t GetMBNulLen() const
2642 {
2643 switch ( m_enc )
2644 {
2645 case wxFONTENCODING_UTF16BE:
2646 case wxFONTENCODING_UTF16LE:
2647 return 2;
2648
2649 case wxFONTENCODING_UTF32BE:
2650 case wxFONTENCODING_UTF32LE:
2651 return 4;
2652
2653 default:
2654 return 1;
2655 }
2656 }
2657
2658 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2659
2660 bool IsOk() const { return m_ok; }
2661
2662 public:
2663 wxFontEncoding m_enc;
2664 wxEncodingConverter m2w, w2m;
2665
2666 private:
2667 // were we initialized successfully?
2668 bool m_ok;
2669
2670 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2671 };
2672
2673 // make the constructors available for unit testing
2674 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2675 {
2676 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2677 if ( !result->IsOk() )
2678 {
2679 delete result;
2680 return 0;
2681 }
2682
2683 return result;
2684 }
2685
2686 #endif // wxUSE_FONTMAP
2687
2688 // ============================================================================
2689 // wxCSConv implementation
2690 // ============================================================================
2691
2692 void wxCSConv::Init()
2693 {
2694 m_name = NULL;
2695 m_convReal = NULL;
2696 m_deferred = true;
2697 }
2698
2699 wxCSConv::wxCSConv(const wxString& charset)
2700 {
2701 Init();
2702
2703 if ( !charset.empty() )
2704 {
2705 SetName(charset.ToAscii());
2706 }
2707
2708 #if wxUSE_FONTMAP
2709 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2710 #else
2711 m_encoding = wxFONTENCODING_SYSTEM;
2712 #endif
2713 }
2714
2715 wxCSConv::wxCSConv(wxFontEncoding encoding)
2716 {
2717 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2718 {
2719 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2720
2721 encoding = wxFONTENCODING_SYSTEM;
2722 }
2723
2724 Init();
2725
2726 m_encoding = encoding;
2727 }
2728
2729 wxCSConv::~wxCSConv()
2730 {
2731 Clear();
2732 }
2733
2734 wxCSConv::wxCSConv(const wxCSConv& conv)
2735 : wxMBConv()
2736 {
2737 Init();
2738
2739 SetName(conv.m_name);
2740 m_encoding = conv.m_encoding;
2741 }
2742
2743 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2744 {
2745 Clear();
2746
2747 SetName(conv.m_name);
2748 m_encoding = conv.m_encoding;
2749
2750 return *this;
2751 }
2752
2753 void wxCSConv::Clear()
2754 {
2755 free(m_name);
2756 delete m_convReal;
2757
2758 m_name = NULL;
2759 m_convReal = NULL;
2760 }
2761
2762 void wxCSConv::SetName(const char *charset)
2763 {
2764 if (charset)
2765 {
2766 m_name = wxStrdup(charset);
2767 m_deferred = true;
2768 }
2769 }
2770
2771 #if wxUSE_FONTMAP
2772
2773 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2774 wxEncodingNameCache );
2775
2776 static wxEncodingNameCache gs_nameCache;
2777 #endif
2778
2779 wxMBConv *wxCSConv::DoCreate() const
2780 {
2781 #if wxUSE_FONTMAP
2782 wxLogTrace(TRACE_STRCONV,
2783 wxT("creating conversion for %s"),
2784 (m_name ? m_name
2785 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2786 #endif // wxUSE_FONTMAP
2787
2788 // check for the special case of ASCII or ISO8859-1 charset: as we have
2789 // special knowledge of it anyhow, we don't need to create a special
2790 // conversion object
2791 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2792 m_encoding == wxFONTENCODING_DEFAULT )
2793 {
2794 // don't convert at all
2795 return NULL;
2796 }
2797
2798 // we trust OS to do conversion better than we can so try external
2799 // conversion methods first
2800 //
2801 // the full order is:
2802 // 1. OS conversion (iconv() under Unix or Win32 API)
2803 // 2. hard coded conversions for UTF
2804 // 3. wxEncodingConverter as fall back
2805
2806 // step (1)
2807 #ifdef HAVE_ICONV
2808 #if !wxUSE_FONTMAP
2809 if ( m_name )
2810 #endif // !wxUSE_FONTMAP
2811 {
2812 #if wxUSE_FONTMAP
2813 wxFontEncoding encoding(m_encoding);
2814 #endif
2815
2816 if ( m_name )
2817 {
2818 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2819 if ( conv->IsOk() )
2820 return conv;
2821
2822 delete conv;
2823
2824 #if wxUSE_FONTMAP
2825 encoding =
2826 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2827 #endif // wxUSE_FONTMAP
2828 }
2829 #if wxUSE_FONTMAP
2830 {
2831 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2832 if ( it != gs_nameCache.end() )
2833 {
2834 if ( it->second.empty() )
2835 return NULL;
2836
2837 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2838 if ( conv->IsOk() )
2839 return conv;
2840
2841 delete conv;
2842 }
2843
2844 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2845 // CS : in case this does not return valid names (eg for MacRoman)
2846 // encoding got a 'failure' entry in the cache all the same,
2847 // although it just has to be created using a different method, so
2848 // only store failed iconv creation attempts (or perhaps we
2849 // shoulnd't do this at all ?)
2850 if ( names[0] != NULL )
2851 {
2852 for ( ; *names; ++names )
2853 {
2854 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2855 // will need changes that will obsolete this
2856 wxString name(*names);
2857 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2858 if ( conv->IsOk() )
2859 {
2860 gs_nameCache[encoding] = *names;
2861 return conv;
2862 }
2863
2864 delete conv;
2865 }
2866
2867 gs_nameCache[encoding] = _T(""); // cache the failure
2868 }
2869 }
2870 #endif // wxUSE_FONTMAP
2871 }
2872 #endif // HAVE_ICONV
2873
2874 #ifdef wxHAVE_WIN32_MB2WC
2875 {
2876 #if wxUSE_FONTMAP
2877 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2878 : new wxMBConv_win32(m_encoding);
2879 if ( conv->IsOk() )
2880 return conv;
2881
2882 delete conv;
2883 #else
2884 return NULL;
2885 #endif
2886 }
2887 #endif // wxHAVE_WIN32_MB2WC
2888
2889 #ifdef __DARWIN__
2890 {
2891 // leave UTF16 and UTF32 to the built-ins of wx
2892 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2893 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2894 {
2895 #if wxUSE_FONTMAP
2896 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2897 : new wxMBConv_cf(m_encoding);
2898 #else
2899 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2900 #endif
2901
2902 if ( conv->IsOk() )
2903 return conv;
2904
2905 delete conv;
2906 }
2907 }
2908 #endif // __DARWIN__
2909
2910 // step (2)
2911 wxFontEncoding enc = m_encoding;
2912 #if wxUSE_FONTMAP
2913 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2914 {
2915 // use "false" to suppress interactive dialogs -- we can be called from
2916 // anywhere and popping up a dialog from here is the last thing we want to
2917 // do
2918 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2919 }
2920 #endif // wxUSE_FONTMAP
2921
2922 switch ( enc )
2923 {
2924 case wxFONTENCODING_UTF7:
2925 return new wxMBConvUTF7;
2926
2927 case wxFONTENCODING_UTF8:
2928 return new wxMBConvUTF8;
2929
2930 case wxFONTENCODING_UTF16BE:
2931 return new wxMBConvUTF16BE;
2932
2933 case wxFONTENCODING_UTF16LE:
2934 return new wxMBConvUTF16LE;
2935
2936 case wxFONTENCODING_UTF32BE:
2937 return new wxMBConvUTF32BE;
2938
2939 case wxFONTENCODING_UTF32LE:
2940 return new wxMBConvUTF32LE;
2941
2942 default:
2943 // nothing to do but put here to suppress gcc warnings
2944 break;
2945 }
2946
2947 // step (3)
2948 #if wxUSE_FONTMAP
2949 {
2950 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2951 : new wxMBConv_wxwin(m_encoding);
2952 if ( conv->IsOk() )
2953 return conv;
2954
2955 delete conv;
2956 }
2957 #endif // wxUSE_FONTMAP
2958
2959 // NB: This is a hack to prevent deadlock. What could otherwise happen
2960 // in Unicode build: wxConvLocal creation ends up being here
2961 // because of some failure and logs the error. But wxLog will try to
2962 // attach a timestamp, for which it will need wxConvLocal (to convert
2963 // time to char* and then wchar_t*), but that fails, tries to log the
2964 // error, but wxLog has an (already locked) critical section that
2965 // guards the static buffer.
2966 static bool alreadyLoggingError = false;
2967 if (!alreadyLoggingError)
2968 {
2969 alreadyLoggingError = true;
2970 wxLogError(_("Cannot convert from the charset '%s'!"),
2971 m_name ? m_name
2972 :
2973 #if wxUSE_FONTMAP
2974 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2975 #else // !wxUSE_FONTMAP
2976 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2977 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2978 );
2979
2980 alreadyLoggingError = false;
2981 }
2982
2983 return NULL;
2984 }
2985
2986 void wxCSConv::CreateConvIfNeeded() const
2987 {
2988 if ( m_deferred )
2989 {
2990 wxCSConv *self = (wxCSConv *)this; // const_cast
2991
2992 // if we don't have neither the name nor the encoding, use the default
2993 // encoding for this system
2994 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2995 {
2996 #if wxUSE_INTL
2997 self->m_encoding = wxLocale::GetSystemEncoding();
2998 #else
2999 // fallback to some reasonable default:
3000 self->m_encoding = wxFONTENCODING_ISO8859_1;
3001 #endif // wxUSE_INTL
3002 }
3003
3004 self->m_convReal = DoCreate();
3005 self->m_deferred = false;
3006 }
3007 }
3008
3009 bool wxCSConv::IsOk() const
3010 {
3011 CreateConvIfNeeded();
3012
3013 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3014 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3015 return true; // always ok as we do it ourselves
3016
3017 // m_convReal->IsOk() is called at its own creation, so we know it must
3018 // be ok if m_convReal is non-NULL
3019 return m_convReal != NULL;
3020 }
3021
3022 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3023 const char *src, size_t srcLen) const
3024 {
3025 CreateConvIfNeeded();
3026
3027 if (m_convReal)
3028 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3029
3030 // latin-1 (direct)
3031 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3032 }
3033
3034 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3035 const wchar_t *src, size_t srcLen) const
3036 {
3037 CreateConvIfNeeded();
3038
3039 if (m_convReal)
3040 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3041
3042 // latin-1 (direct)
3043 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3044 }
3045
3046 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3047 {
3048 CreateConvIfNeeded();
3049
3050 if (m_convReal)
3051 return m_convReal->MB2WC(buf, psz, n);
3052
3053 // latin-1 (direct)
3054 size_t len = strlen(psz);
3055
3056 if (buf)
3057 {
3058 for (size_t c = 0; c <= len; c++)
3059 buf[c] = (unsigned char)(psz[c]);
3060 }
3061
3062 return len;
3063 }
3064
3065 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3066 {
3067 CreateConvIfNeeded();
3068
3069 if (m_convReal)
3070 return m_convReal->WC2MB(buf, psz, n);
3071
3072 // latin-1 (direct)
3073 const size_t len = wxWcslen(psz);
3074 if (buf)
3075 {
3076 for (size_t c = 0; c <= len; c++)
3077 {
3078 if (psz[c] > 0xFF)
3079 return wxCONV_FAILED;
3080
3081 buf[c] = (char)psz[c];
3082 }
3083 }
3084 else
3085 {
3086 for (size_t c = 0; c <= len; c++)
3087 {
3088 if (psz[c] > 0xFF)
3089 return wxCONV_FAILED;
3090 }
3091 }
3092
3093 return len;
3094 }
3095
3096 size_t wxCSConv::GetMBNulLen() const
3097 {
3098 CreateConvIfNeeded();
3099
3100 if ( m_convReal )
3101 {
3102 return m_convReal->GetMBNulLen();
3103 }
3104
3105 // otherwise, we are ISO-8859-1
3106 return 1;
3107 }
3108
3109 #if wxUSE_UNICODE_UTF8
3110 bool wxCSConv::IsUTF8() const
3111 {
3112 CreateConvIfNeeded();
3113
3114 if ( m_convReal )
3115 {
3116 return m_convReal->IsUTF8();
3117 }
3118
3119 // otherwise, we are ISO-8859-1
3120 return false;
3121 }
3122 #endif
3123
3124
3125 #if wxUSE_UNICODE
3126
3127 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3128 {
3129 if ( !s )
3130 return wxWCharBuffer();
3131
3132 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3133 if ( !wbuf )
3134 wbuf = wxMBConvUTF8().cMB2WX(s);
3135 if ( !wbuf )
3136 wbuf = wxConvISO8859_1.cMB2WX(s);
3137
3138 return wbuf;
3139 }
3140
3141 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3142 {
3143 if ( !ws )
3144 return wxCharBuffer();
3145
3146 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3147 if ( !buf )
3148 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3149
3150 return buf;
3151 }
3152
3153 #endif // wxUSE_UNICODE
3154
3155 // ----------------------------------------------------------------------------
3156 // globals
3157 // ----------------------------------------------------------------------------
3158
3159 // NB: The reason why we create converted objects in this convoluted way,
3160 // using a factory function instead of global variable, is that they
3161 // may be used at static initialization time (some of them are used by
3162 // wxString ctors and there may be a global wxString object). In other
3163 // words, possibly _before_ the converter global object would be
3164 // initialized.
3165
3166 #undef wxConvLibc
3167 #undef wxConvUTF8
3168 #undef wxConvUTF7
3169 #undef wxConvLocal
3170 #undef wxConvISO8859_1
3171
3172 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3173 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3174 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3175 { \
3176 static impl_klass name##Obj ctor_args; \
3177 return &name##Obj; \
3178 } \
3179 /* this ensures that all global converter objects are created */ \
3180 /* by the time static initialization is done, i.e. before any */ \
3181 /* thread is launched: */ \
3182 static klass* gs_##name##instance = wxGet_##name##Ptr()
3183
3184 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3185 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3186
3187 #ifdef __WINDOWS__
3188 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3189 #else
3190 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3191 #endif
3192
3193 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3194 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3195 // provokes an error message about "not enough macro parameters"; and we
3196 // can't use "()" here as the name##Obj declaration would be parsed as a
3197 // function declaration then, so use a semicolon and live with an extra
3198 // empty statement (and hope that no compilers warns about this)
3199 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3200 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3201
3202 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3203 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3204
3205 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3206 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3207
3208 #ifdef __DARWIN__
3209 // The xnu kernel always communicates file paths in decomposed UTF-8.
3210 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3211 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3212 #endif
3213
3214 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3215 #ifdef __DARWIN__
3216 &wxConvMacUTF8DObj;
3217 #else // !__DARWIN__
3218 wxGet_wxConvLibcPtr();
3219 #endif // __DARWIN__/!__DARWIN__
3220
3221 #else // !wxUSE_WCHAR_T
3222
3223 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3224 // stand-ins in absence of wchar_t
3225 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3226 wxConvISO8859_1,
3227 wxConvLocal,
3228 wxConvUTF8;
3229
3230 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T