]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
4d672fedfb124af18c47b2927a4f7142f791954d
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef __SALFORDC__
48 #include <clib.h>
49 #endif
50
51 #ifdef HAVE_ICONV
52 #include <iconv.h>
53 #include "wx/thread.h"
54 #endif
55
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
58
59 #ifdef __DARWIN__
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
62
63
64 #define TRACE_STRCONV _T("strconv")
65
66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, i.e. when wchar_t strings
67 // hold UTF-16 code units; otherwise wchar_t is assumed to be 4 bytes wide
68 #if SIZEOF_WCHAR_T == 2
69 #define WC_UTF16
70 #endif
71
72
73 // ============================================================================
74 // implementation
75 // ============================================================================
76
// Helper used when looking for string terminators: returns true if at least
// one of the n bytes starting at p is not NUL and false if all of them are
// (in particular, it returns false when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
85
86 // ----------------------------------------------------------------------------
87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
88 // ----------------------------------------------------------------------------
89
90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
91 {
92 if (input <= 0xffff)
93 {
94 if (output)
95 *output = (wxUint16) input;
96
97 return 1;
98 }
99 else if (input >= 0x110000)
100 {
101 return wxCONV_FAILED;
102 }
103 else
104 {
105 if (output)
106 {
107 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
108 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
109 }
110
111 return 2;
112 }
113 }
114
115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
116 {
117 if ((*input < 0xd800) || (*input > 0xdfff))
118 {
119 output = *input;
120 return 1;
121 }
122 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
123 {
124 output = *input;
125 return wxCONV_FAILED;
126 }
127 else
128 {
129 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
130 return 2;
131 }
132 }
133
134 #ifdef WC_UTF16
135 typedef wchar_t wxDecodeSurrogate_t;
136 #else // !WC_UTF16
137 typedef wxUint16 wxDecodeSurrogate_t;
138 #endif // WC_UTF16/!WC_UTF16
139
140 // returns the next UTF-32 character from the wchar_t buffer and advances the
141 // pointer to the character after this one
142 //
143 // if an invalid character is found, *pSrc is set to NULL, the caller must
144 // check for this
145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
146 {
147 wxUint32 out;
148 const size_t
149 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
150 if ( n == wxCONV_FAILED )
151 *pSrc = NULL;
152 else
153 *pSrc += n;
154
155 return out;
156 }
157
158 // ----------------------------------------------------------------------------
159 // wxMBConv
160 // ----------------------------------------------------------------------------
161
162 size_t
163 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
164 const char *src, size_t srcLen) const
165 {
166 // although new conversion classes are supposed to implement this function
167 // directly, the existins ones only implement the old MB2WC() and so, to
168 // avoid to have to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
172
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten = 0;
175
176 // the number of NULs terminating this string
177 size_t nulLen = 0; // not really needed, but just to avoid warnings
178
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
182 // NULs at the end
183 wxCharBuffer bufTmp;
184 const char *srcEnd;
185 if ( srcLen != wxNO_LEN )
186 {
187 // we need to know how to find the end of this string
188 nulLen = GetMBNulLen();
189 if ( nulLen == wxCONV_FAILED )
190 return wxCONV_FAILED;
191
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
194 {
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
197 char * const p = bufTmp.data();
198 memcpy(p, src, srcLen);
199 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
200 *s = '\0';
201
202 src = bufTmp;
203 }
204
205 srcEnd = src + srcLen;
206 }
207 else // quit after the first loop iteration
208 {
209 srcEnd = NULL;
210 }
211
212 for ( ;; )
213 {
214 // try to convert the current chunk
215 size_t lenChunk = MB2WC(NULL, src, 0);
216 if ( lenChunk == wxCONV_FAILED )
217 return wxCONV_FAILED;
218
219 lenChunk++; // for the L'\0' at the end of this chunk
220
221 dstWritten += lenChunk;
222
223 if ( lenChunk == 1 )
224 {
225 // nothing left in the input string, conversion succeeded
226 break;
227 }
228
229 if ( dst )
230 {
231 if ( dstWritten > dstLen )
232 return wxCONV_FAILED;
233
234 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
235 return wxCONV_FAILED;
236
237 dst += lenChunk;
238 }
239
240 if ( !srcEnd )
241 {
242 // we convert just one chunk in this case as this is the entire
243 // string anyhow
244 break;
245 }
246
247 // advance the input pointer past the end of this chunk
248 while ( NotAllNULs(src, nulLen) )
249 {
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
254 src += nulLen;
255 }
256
257 src += nulLen; // skipping over its terminator as well
258
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by inEnd
262 if ( src >= srcEnd )
263 break;
264 }
265
266 return dstWritten;
267 }
268
269 size_t
270 wxMBConv::FromWChar(char *dst, size_t dstLen,
271 const wchar_t *src, size_t srcLen) const
272 {
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten = 0;
275
276 // make a copy of the input string unless it is already properly
277 // NUL-terminated
278 //
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp;
282 if ( srcLen == wxNO_LEN )
283 {
284 srcLen = wxWcslen(src) + 1;
285 }
286 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
287 {
288 // make a copy in order to properly NUL-terminate the string
289 bufTmp = wxWCharBuffer(srcLen);
290 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
291 src = bufTmp;
292 }
293
294 const size_t lenNul = GetMBNulLen();
295 for ( const wchar_t * const srcEnd = src + srcLen;
296 src < srcEnd;
297 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
298 {
299 // try to convert the current chunk
300 size_t lenChunk = WC2MB(NULL, src, 0);
301
302 if ( lenChunk == wxCONV_FAILED )
303 return wxCONV_FAILED;
304
305 lenChunk += lenNul;
306 dstWritten += lenChunk;
307
308 if ( dst )
309 {
310 if ( dstWritten > dstLen )
311 return wxCONV_FAILED;
312
313 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dst += lenChunk;
317 }
318 }
319
320 return dstWritten;
321 }
322
323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
324 {
325 size_t rc = ToWChar(outBuff, outLen, inBuff);
326 if ( rc != wxCONV_FAILED )
327 {
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
330 rc--;
331 }
332
333 return rc;
334 }
335
336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
337 {
338 size_t rc = FromWChar(outBuff, outLen, inBuff);
339 if ( rc != wxCONV_FAILED )
340 {
341 rc -= GetMBNulLen();
342 }
343
344 return rc;
345 }
346
// Destructor: intentionally empty, it releases nothing.
347 wxMBConv::~wxMBConv()
348 {
349 // nothing to do here (necessary for Darwin linking probably): according to
// this note the only reason for defining it here is to avoid link problems
350 }
351
352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
353 {
354 if ( psz )
355 {
356 // calculate the length of the buffer needed first
357 const size_t nLen = MB2WC(NULL, psz, 0);
358 if ( nLen != wxCONV_FAILED )
359 {
360 // now do the actual conversion
361 wxWCharBuffer buf(nLen /* +1 added implicitly */);
362
363 // +1 for the trailing NULL
364 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
365 return buf;
366 }
367 }
368
369 return wxWCharBuffer();
370 }
371
372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
373 {
374 if ( pwz )
375 {
376 const size_t nLen = WC2MB(NULL, pwz, 0);
377 if ( nLen != wxCONV_FAILED )
378 {
379 // extra space for trailing NUL(s)
380 static const size_t extraLen = GetMaxMBNulLen();
381
382 wxCharBuffer buf(nLen + extraLen - 1);
383 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
384 return buf;
385 }
386 }
387
388 return wxCharBuffer();
389 }
390
391 const wxWCharBuffer
392 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
393 {
394 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
395 if ( dstLen != wxCONV_FAILED )
396 {
397 wxWCharBuffer wbuf(dstLen - 1);
398 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
399 {
400 if ( outLen )
401 {
402 *outLen = dstLen;
403 if ( wbuf[dstLen - 1] == L'\0' )
404 (*outLen)--;
405 }
406
407 return wbuf;
408 }
409 }
410
411 if ( outLen )
412 *outLen = 0;
413
414 return wxWCharBuffer();
415 }
416
417 const wxCharBuffer
418 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
419 {
420 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
421 if ( dstLen != wxCONV_FAILED )
422 {
423 // special case of empty input: can't allocate 0 size buffer below as
424 // wxCharBuffer insists on NUL-terminating it
425 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
426 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
427 {
428 if ( outLen )
429 {
430 *outLen = dstLen;
431
432 const size_t nulLen = GetMBNulLen();
433 if ( dstLen >= nulLen &&
434 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
435 {
436 // in this case the output is NUL-terminated and we're not
437 // supposed to count NUL
438 *outLen -= nulLen;
439 }
440 }
441
442 return buf;
443 }
444 }
445
446 if ( outLen )
447 *outLen = 0;
448
449 return wxCharBuffer();
450 }
451
452 // ----------------------------------------------------------------------------
453 // wxMBConvLibc
454 // ----------------------------------------------------------------------------
455
456 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
457 {
458 return wxMB2WC(buf, psz, n);
459 }
460
461 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
462 {
463 return wxWC2MB(buf, psz, n);
464 }
465
466 // ----------------------------------------------------------------------------
467 // wxConvBrokenFileNames
468 // ----------------------------------------------------------------------------
469
470 #ifdef __UNIX__
471
472 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
473 {
474 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
475 wxStricmp(charset, _T("UTF8")) == 0 )
476 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
477 else
478 m_conv = new wxCSConv(charset);
479 }
480
481 #endif // __UNIX__
482
483 // ----------------------------------------------------------------------------
484 // UTF-7
485 // ----------------------------------------------------------------------------
486
487 // Implementation (C) 2004 Fredrik Roubert
488
//
// BASE64 decoding table: maps a byte to the 6 bit value of the base64 digit
// it represents or to 0xff if the byte is not a base64 digit
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 00..07
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 08..0F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 10..17
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 18..1F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 20..27
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // 28..2F: '+' and '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // 30..37: '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 38..3F: '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 40..47: 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 48..4F: 'H'..'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 50..57: 'P'..'W'
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 58..5F: 'X'..'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 60..67: 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 68..6F: 'h'..'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 70..77: 'p'..'w'
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 78..7F: 'x'..'z'

    // none of the bytes with the high bit set is a base64 digit
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 80..87
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 88..8F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 90..97
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 98..9F
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // A0..A7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // A8..AF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // B0..B7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // B8..BF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // C0..C7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // C8..CF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // D0..D7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // D8..DF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // E0..E7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // E8..EF
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // F0..F7
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  // F8..FF
};
527
// Decodes the NUL-terminated UTF-7 string psz.
//
// If buf is NULL nothing is stored and only the length of the result is
// computed, otherwise the decoded 16 bit units are written to buf.
//
// Returns the number of wide characters produced, not counting the
// terminating NUL (which is only stored if fewer than n characters were
// produced), or wxCONV_FAILED if a '+' which is not part of "+-" is not
// followed by enough base64 digits to decode at least one byte.
//
// NOTE(review): n is only tested before each directly encoded character or
// each '+', not inside a base64 run, so it looks like a long run can store
// more than n characters in buf -- confirm against the callers.
528 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
529 {
530 size_t len = 0;
531
532 while ( *psz && (!buf || (len < n)) )
533 {
534 unsigned char cc = *psz++;
535 if (cc != '+')
536 {
537 // plain ASCII char
538 if (buf)
539 *buf++ = cc;
540 len++;
541 }
542 else if (*psz == '-')
543 {
544 // encoded plus sign
545 if (buf)
546 *buf++ = cc;
547 len++;
548 psz++;
549 }
550 else // start of BASE64 encoded string
551 {
// d accumulates the decoded bits and l is the number of bits in it which
// were not consumed yet; lsb tells whether the next decoded byte is the low
// (true) or the high (false) byte of a 16 bit unit; ok becomes true as soon
// as at least one byte was decoded
552 bool lsb, ok;
553 unsigned int d, l;
// the run goes on for as long as the input consists of base64 digits
554 for ( ok = lsb = false, d = 0, l = 0;
555 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
556 psz++ )
557 {
// each base64 digit contributes 6 more bits
558 d <<= 6;
559 d += cc;
// extract a byte whenever at least 8 bits are available, alternating between
// the high and the low byte of the output unit
560 for (l += 6; l >= 8; lsb = !lsb)
561 {
562 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
563 if (lsb)
564 {
// the low byte completes the unit whose high byte was stored just before
565 if (buf)
566 *buf++ |= c;
567 len ++;
568 }
569 else
570 {
// the high byte starts a new unit, it is only counted once it is completed
571 if (buf)
572 *buf = (wchar_t)(c << 8);
573 }
574
575 ok = true;
576 }
577 }
578
579 if ( !ok )
580 {
581 // in valid UTF7 we should have valid characters after '+'
582 return wxCONV_FAILED;
583 }
584
// a '-' directly following the run only terminates it and is skipped
585 if (*psz == '-')
586 psz++;
587 }
588 }
589
// terminate the output if there is still room for it
590 if ( buf && (len < n) )
591 *buf = '\0';
592
593 return len;
594 }
595
//
// BASE64 encoding table: element i is the base64 digit for the 6 bit value i
//
static const unsigned char utf7enb64[] =
{
    // values 0..25: upper case letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',

    // values 26..51: lower case letters
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',

    // values 52..61: digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',

    // values 62 and 63
    '+', '/'
};
610
//
// UTF-7 encoding table, indexed by the ASCII character code; the values
// give the class of the character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 00..0F
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 10..1F
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 20..2F: ' '..'/'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 30..3F: '0'..'?'
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4F: '@'..'O'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 50..5F: 'P'..'_'
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6F: '`'..'o'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 70..7F: 'p'..DEL
};
630
// Encodes the NUL-terminated wide string psz as UTF-7.
//
// If buf is NULL nothing is stored and only the length of the result is
// computed, otherwise the encoded bytes are written to buf.
//
// Only the characters whose class in utf7encode is 0 are output directly,
// a '+' is output as "+-" and all the others are output as a base64 run
// started by '+' and ended by '-'.
//
// Returns the number of bytes produced, not counting the terminating NUL
// (which is only stored if fewer than n bytes were produced). When wchar_t
// is not UTF-16, wxCONV_FAILED is returned if a character above 0xffff is
// found where a directly encoded character or the start of a run is looked
// for.
//
// NOTE(review): n is only tested once per iteration of the outer loop, not
// for each byte of a base64 run, so it looks like more than n bytes can be
// stored in buf -- confirm against the callers.
631 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
632 {
633 size_t len = 0;
634
635 while (*psz && ((!buf) || (len < n)))
636 {
637 wchar_t cc = *psz++;
638 if (cc < 0x80 && utf7encode[cc] < 1)
639 {
640 // plain ASCII char
641 if (buf)
642 *buf++ = (char)cc;
643
644 len++;
645 }
646 #ifndef WC_UTF16
647 else if (((wxUint32)cc) > 0xffff)
648 {
649 // no surrogate pair generation (yet?)
650 return wxCONV_FAILED;
651 }
652 #endif
653 else
654 {
// everything else starts with '+': either "+-" for the plus sign itself or
// a base64 run
655 if (buf)
656 *buf++ = '+';
657
658 len++;
659 if (cc != '+')
660 {
661 // BASE64 encode string
// d accumulates the bits to encode and l is the number of bits in it which
// were not output yet; lsb selects the high (0) and then the low (1) byte of
// the current character
662 unsigned int lsb, d, l;
663 for (d = 0, l = 0; /*nothing*/; psz++)
664 {
665 for (lsb = 0; lsb < 2; lsb ++)
666 {
667 d <<= 8;
668 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
669
// output a base64 digit for each complete group of 6 bits
670 for (l += 8; l >= 6; )
671 {
672 l -= 6;
673 if (buf)
674 *buf++ = utf7enb64[(d >> l) % 64];
675 len++;
676 }
677 }
678
// the run ends at the end of the string or at the first character which can
// be output directly; this character is not consumed here
679 cc = *psz;
680 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
681 break;
682 }
683
// the remaining l bits, if any, are left-aligned in a last base64 digit
684 if (l != 0)
685 {
686 if (buf)
687 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
688
689 len++;
690 }
691 }
692
// '-' ends the run and also completes the "+-" sequence encoding the plus
693 if (buf)
694 *buf++ = '-';
695 len++;
696 }
697 }
698
// terminate the output if there is still room for it
699 if (buf && (len < n))
700 *buf = 0;
701
702 return len;
703 }
704
705 // ----------------------------------------------------------------------------
706 // UTF-8
707 // ----------------------------------------------------------------------------
708
709 static wxUint32 utf8_max[]=
710 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
711
712 // boundaries of the private use area we use to (temporarily) remap invalid
713 // characters invalid in a UTF-8 encoded string
714 const wxUint32 wxUnicodePUA = 0x100000;
715 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
716
// this table gives the length of the UTF-8 encoding from its first byte, 0
// meaning that the byte can't start a sequence:
//
//  - 00..7F are single-byte sequences (ASCII)
//  - 80..BF can't be the first byte and C0 and C1 are invalid too
//  - C2..DF start two-byte sequences
//  - E0..EF start three-byte sequences
//  - F0..F4 start four-byte sequences
//  - F5..FF are invalid again (5- or 6-byte sequences and sequences for
//    code points above U+10FFFF, as restricted by RFC 3629)
unsigned char tableUtf8Lengths[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C0..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F0..FF
};
751
752 size_t
753 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
754 const char *src, size_t srcLen) const
755 {
756 wchar_t *out = dstLen ? dst : NULL;
757 size_t written = 0;
758
759 if ( srcLen == wxNO_LEN )
760 srcLen = strlen(src) + 1;
761
762 for ( const char *p = src; ; p++ )
763 {
764 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
765 {
766 // all done successfully, just add the trailing NULL if we are not
767 // using explicit length
768 if ( srcLen == wxNO_LEN )
769 {
770 if ( out )
771 {
772 if ( !dstLen )
773 break;
774
775 *out = L'\0';
776 }
777
778 written++;
779 }
780
781 return written;
782 }
783
784 unsigned char c = *p;
785 unsigned len = tableUtf8Lengths[c];
786 if ( !len )
787 break;
788
789 if ( srcLen < len ) // the test works for wxNO_LEN too
790 break;
791
792 if ( srcLen != wxNO_LEN )
793 srcLen -= len;
794
795 if ( out && !dstLen-- )
796 break;
797
798
799 // Char. number range | UTF-8 octet sequence
800 // (hexadecimal) | (binary)
801 // ----------------------+---------------------------------------------
802 // 0000 0000 - 0000 007F | 0xxxxxxx
803 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
804 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
805 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
806 //
807 // Code point value is stored in bits marked with 'x', lowest-order bit
808 // of the value on the right side in the diagram above.
809 // (from RFC 3629)
810
811 // mask to extract lead byte's value ('x' bits above), by sequence length:
812 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
813
814 // mask and value of lead byte's most significant bits, by length:
815 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
816 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
817
818 len--; // it's more convenient to work with 0-based length here
819
820 // extract the lead byte's value bits:
821 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
822 break;
823
824 wxUint32 code = c & leadValueMask[len];
825
826 // all remaining bytes, if any, are handled in the same way regardless of
827 // sequence's length:
828 for ( ; len; --len )
829 {
830 c = *++p;
831 if ( (c & 0xC0) != 0x80 )
832 return wxCONV_FAILED;
833
834 code <<= 6;
835 code |= c & 0x3F;
836 }
837
838 #ifdef WC_UTF16
839 // cast is ok because wchar_t == wxUint16 if WC_UTF16
840 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
841 {
842 if ( out )
843 out++;
844 written++;
845 }
846 #else // !WC_UTF16
847 if ( out )
848 *out = code;
849 #endif // WC_UTF16/!WC_UTF16
850
851 if ( out )
852 out++;
853
854 written++;
855 }
856
857 return wxCONV_FAILED;
858 }
859
860 size_t
861 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
862 const wchar_t *src, size_t srcLen) const
863 {
864 char *out = dstLen ? dst : NULL;
865 size_t written = 0;
866
867 for ( const wchar_t *wp = src; ; wp++ )
868 {
869 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
870 {
871 // all done successfully, just add the trailing NULL if we are not
872 // using explicit length
873 if ( srcLen == wxNO_LEN )
874 {
875 if ( out )
876 {
877 if ( !dstLen )
878 break;
879
880 *out = '\0';
881 }
882
883 written++;
884 }
885
886 return written;
887 }
888
889
890 wxUint32 code;
891 #ifdef WC_UTF16
892 // cast is ok for WC_UTF16
893 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
894 {
895 // skip the next char too as we decoded a surrogate
896 wp++;
897 }
898 #else // wchar_t is UTF-32
899 code = *wp & 0x7fffffff;
900 #endif
901
902 unsigned len;
903 if ( code <= 0x7F )
904 {
905 len = 1;
906 if ( out )
907 {
908 if ( dstLen < len )
909 break;
910
911 out[0] = (char)code;
912 }
913 }
914 else if ( code <= 0x07FF )
915 {
916 len = 2;
917 if ( out )
918 {
919 if ( dstLen < len )
920 break;
921
922 // NB: this line takes 6 least significant bits, encodes them as
923 // 10xxxxxx and discards them so that the next byte can be encoded:
924 out[1] = 0x80 | (code & 0x3F); code >>= 6;
925 out[0] = 0xC0 | code;
926 }
927 }
928 else if ( code < 0xFFFF )
929 {
930 len = 3;
931 if ( out )
932 {
933 if ( dstLen < len )
934 break;
935
936 out[2] = 0x80 | (code & 0x3F); code >>= 6;
937 out[1] = 0x80 | (code & 0x3F); code >>= 6;
938 out[0] = 0xE0 | code;
939 }
940 }
941 else if ( code <= 0x10FFFF )
942 {
943 len = 4;
944 if ( out )
945 {
946 if ( dstLen < len )
947 break;
948
949 out[3] = 0x80 | (code & 0x3F); code >>= 6;
950 out[2] = 0x80 | (code & 0x3F); code >>= 6;
951 out[1] = 0x80 | (code & 0x3F); code >>= 6;
952 out[0] = 0xF0 | code;
953 }
954 }
955 else
956 {
957 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
958 break;
959 }
960
961 if ( out )
962 {
963 out += len;
964 dstLen -= len;
965 }
966
967 written += len;
968 }
969
970 // we only get here if an error occurs during decoding
971 return wxCONV_FAILED;
972 }
973
// Lenient UTF-8 decoder: converts the NUL-terminated string psz to wide
// characters, remapping the bytes which don't form valid UTF-8 according to
// m_options instead of failing.
//
// If buf is NULL nothing is stored and only the length of the result is
// computed, otherwise up to n wide characters are written to buf.
//
// Returns the number of wide characters produced, not counting the
// terminating NUL (which is only stored if fewer than n characters were
// produced), or wxCONV_FAILED if an invalid sequence is found and neither
// the PUA nor the octal mapping is enabled.
974 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
975 {
// without any remapping this is just the strict conversion
976 if ( m_options == MAP_INVALID_UTF8_NOT )
977 return wxMBConvStrictUTF8::MB2WC(buf, psz, n);
978
979 size_t len = 0;
980
981 while (*psz && ((!buf) || (len < n)))
982 {
// remember the start of the sequence: if it turns out to be invalid, all the
// bytes between opsz and psz are remapped one by one
983 const char *opsz = psz;
984 bool invalid = false;
985 unsigned char cc = *psz++, fc = cc;
// count the number of leading 1 bits of the first byte
986 unsigned cnt;
987 for (cnt = 0; fc & 0x80; cnt++)
988 fc <<= 1;
989
990 if (!cnt)
991 {
992 // plain ASCII char
993 if (buf)
994 *buf++ = cc;
995 len++;
996
997 // escape the escape character for octal escapes
998 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
999 && cc == '\\' && (!buf || len < n))
1000 {
1001 if (buf)
1002 *buf++ = cc;
1003 len++;
1004 }
1005 }
1006 else
1007 {
// cnt is now the number of the bytes expected to follow the first one
1008 cnt--;
1009 if (!cnt)
1010 {
1011 // invalid UTF-8 sequence
// (the byte had a single leading 1 bit, i.e. it has the form 10xxxxxx which
// is only valid for the bytes following the first one)
1012 invalid = true;
1013 }
1014 else
1015 {
// ocnt is the index in utf8_max of the greatest value which a sequence
// shorter by one byte can represent
1016 unsigned ocnt = cnt - 1;
1017 wxUint32 res = cc & (0x3f >> cnt);
1018 while (cnt--)
1019 {
1020 cc = *psz;
1021 if ((cc & 0xC0) != 0x80)
1022 {
1023 // invalid UTF-8 sequence
1024 invalid = true;
1025 break;
1026 }
1027
1028 psz++;
1029 res = (res << 6) | (cc & 0x3f);
1030 }
1031
1032 if (invalid || res <= utf8_max[ocnt])
1033 {
1034 // illegal UTF-8 encoding
// (the value would have fit in a shorter sequence)
1035 invalid = true;
1036 }
1037 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1038 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1039 {
1040 // if one of our PUA characters turns up externally
1041 // it must also be treated as an illegal sequence
1042 // (a bit like you have to escape an escape character)
1043 invalid = true;
1044 }
1045 else
1046 {
1047 #ifdef WC_UTF16
1048 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1049 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1050 if (pa == wxCONV_FAILED)
1051 {
1052 invalid = true;
1053 }
1054 else
1055 {
1056 if (buf)
1057 buf += pa;
1058 len += pa;
1059 }
1060 #else // !WC_UTF16
1061 if (buf)
1062 *buf++ = (wchar_t)res;
1063 len++;
1064 #endif // WC_UTF16/!WC_UTF16
1065 }
1066 }
1067
1068 if (invalid)
1069 {
1070 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1071 {
// each byte of the invalid sequence becomes the character wxUnicodePUA + byte
1072 while (opsz < psz && (!buf || len < n))
1073 {
1074 #ifdef WC_UTF16
1075 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1076 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1077 wxASSERT(pa != wxCONV_FAILED);
1078 if (buf)
1079 buf += pa;
1080 opsz++;
1081 len += pa;
1082 #else
1083 if (buf)
1084 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1085 opsz++;
1086 len++;
1087 #endif
1088 }
1089 }
1090 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1091 {
// each byte of the invalid sequence becomes a backslash followed by 3 octal
// digits; the 4 characters are always counted but are only stored if there
// is enough room for them
1092 while (opsz < psz && (!buf || len < n))
1093 {
1094 if ( buf && len + 3 < n )
1095 {
1096 unsigned char on = *opsz;
1097 *buf++ = L'\\';
1098 *buf++ = (wchar_t)( L'0' + on / 0100 );
1099 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1100 *buf++ = (wchar_t)( L'0' + on % 010 );
1101 }
1102
1103 opsz++;
1104 len += 4;
1105 }
1106 }
1107 else // MAP_INVALID_UTF8_NOT
1108 {
1109 return wxCONV_FAILED;
1110 }
1111 }
1112 }
1113 }
1114
// terminate the output if there is still room for it
1115 if (buf && (len < n))
1116 *buf = 0;
1117
1118 return len;
1119 }
1120
// returns true if the character is an octal digit, i.e. one of '0'..'7'
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
1125
// Lenient UTF-8 encoder: converts the NUL-terminated wide string psz to
// UTF-8, undoing the remapping of invalid bytes selected by m_options.
//
// If buf is NULL nothing is stored and only the length of the result is
// computed, otherwise the bytes are written to buf.
//
// Returns the number of bytes produced, not counting the terminating NUL
// (which is only stored if fewer than n bytes were produced).
//
// NOTE(review): n is only tested once per input character, so it looks like
// the last multi-byte sequence can extend beyond n bytes -- confirm against
// the callers.
1126 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1127 {
// without any remapping this is just the strict conversion
1128 if ( m_options == MAP_INVALID_UTF8_NOT )
1129 return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
1130
1131 size_t len = 0;
1132
1133 while (*psz && ((!buf) || (len < n)))
1134 {
1135 wxUint32 cc;
1136
1137 #ifdef WC_UTF16
1138 // cast is ok for WC_UTF16
// if the surrogates can't be decoded, a single unit is skipped
1139 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1140 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1141 #else
1142 cc = (*psz++) & 0x7fffffff;
1143 #endif
1144
// a character from the remapping range stands for a single raw byte
1145 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1146 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1147 {
1148 if (buf)
1149 *buf++ = (char)(cc - wxUnicodePUA);
1150 len++;
1151 }
// a doubled backslash stands for a single one
1152 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1153 && cc == L'\\' && psz[0] == L'\\' )
1154 {
1155 if (buf)
1156 *buf++ = (char)cc;
1157 psz++;
1158 len++;
1159 }
// a backslash followed by 3 octal digits stands for the byte with this value
// (psz[1] and psz[2] are only examined if the preceding characters are octal
// digits and so not NUL)
1160 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1161 cc == L'\\' &&
1162 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1163 {
1164 if (buf)
1165 {
1166 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1167 (psz[1] - L'0') * 010 +
1168 (psz[2] - L'0'));
1169 }
1170
1171 psz += 3;
1172 len++;
1173 }
1174 else
1175 {
// cnt is the number of the bytes following the first one in the encoding of
// this character
1176 unsigned cnt;
1177 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1178 {
1179 }
1180
1181 if (!cnt)
1182 {
1183 // plain ASCII char
1184 if (buf)
1185 *buf++ = (char) cc;
1186 len++;
1187 }
1188 else
1189 {
1190 len += cnt + 1;
1191 if (buf)
1192 {
// the first byte combines the length marker bits with the most significant
// bits of the value, the other ones contain 6 bits each
// NOTE(review): "-128 >> cnt" presumably relies on the right shift of a
// negative value being arithmetic -- confirm for the supported compilers
1193 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1194 while (cnt--)
1195 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1196 }
1197 }
1198 }
1199 }
1200
// terminate the output if there is still room for it
1201 if (buf && (len < n))
1202 *buf = 0;
1203
1204 return len;
1205 }
1206
1207 // ============================================================================
1208 // UTF-16
1209 // ============================================================================
1210
// The UTF-16 conversions below are written as a "straight" and a "swap"
// class: when WORDS_BIGENDIAN is defined the straight one is wxMBConvUTF16BE
// and the swap one is wxMBConvUTF16LE, otherwise it is the other way round.
1211 #ifdef WORDS_BIGENDIAN
1212 #define wxMBConvUTF16straight wxMBConvUTF16BE
1213 #define wxMBConvUTF16swap wxMBConvUTF16LE
1214 #else
1215 #define wxMBConvUTF16swap wxMBConvUTF16BE
1216 #define wxMBConvUTF16straight wxMBConvUTF16LE
1217 #endif
1218
1219 /* static */
1220 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1221 {
1222 if ( srcLen == wxNO_LEN )
1223 {
1224 // count the number of bytes in input, including the trailing NULs
1225 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1226 for ( srcLen = 1; *inBuff++; srcLen++ )
1227 ;
1228
1229 srcLen *= BYTES_PER_CHAR;
1230 }
1231 else // we already have the length
1232 {
1233 // we can only convert an entire number of UTF-16 characters
1234 if ( srcLen % BYTES_PER_CHAR )
1235 return wxCONV_FAILED;
1236 }
1237
1238 return srcLen;
1239 }
1240
1241 // case when in-memory representation is UTF-16 too
1242 #ifdef WC_UTF16
1243
1244 // ----------------------------------------------------------------------------
1245 // conversions without endianness change
1246 // ----------------------------------------------------------------------------
1247
1248 size_t
1249 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1250 const char *src, size_t srcLen) const
1251 {
1252 // set up the scene for using memcpy() (which is presumably more efficient
1253 // than copying the bytes one by one)
1254 srcLen = GetLength(src, srcLen);
1255 if ( srcLen == wxNO_LEN )
1256 return wxCONV_FAILED;
1257
1258 const size_t inLen = srcLen / BYTES_PER_CHAR;
1259 if ( dst )
1260 {
1261 if ( dstLen < inLen )
1262 return wxCONV_FAILED;
1263
1264 memcpy(dst, src, srcLen);
1265 }
1266
1267 return inLen;
1268 }
1269
1270 size_t
1271 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1272 const wchar_t *src, size_t srcLen) const
1273 {
1274 if ( srcLen == wxNO_LEN )
1275 srcLen = wxWcslen(src) + 1;
1276
1277 srcLen *= BYTES_PER_CHAR;
1278
1279 if ( dst )
1280 {
1281 if ( dstLen < srcLen )
1282 return wxCONV_FAILED;
1283
1284 memcpy(dst, src, srcLen);
1285 }
1286
1287 return srcLen;
1288 }
1289
1290 // ----------------------------------------------------------------------------
1291 // endian-reversing conversions
1292 // ----------------------------------------------------------------------------
1293
1294 size_t
1295 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1296 const char *src, size_t srcLen) const
1297 {
1298 srcLen = GetLength(src, srcLen);
1299 if ( srcLen == wxNO_LEN )
1300 return wxCONV_FAILED;
1301
1302 srcLen /= BYTES_PER_CHAR;
1303
1304 if ( dst )
1305 {
1306 if ( dstLen < srcLen )
1307 return wxCONV_FAILED;
1308
1309 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1310 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1311 {
1312 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1313 }
1314 }
1315
1316 return srcLen;
1317 }
1318
1319 size_t
1320 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1321 const wchar_t *src, size_t srcLen) const
1322 {
1323 if ( srcLen == wxNO_LEN )
1324 srcLen = wxWcslen(src) + 1;
1325
1326 srcLen *= BYTES_PER_CHAR;
1327
1328 if ( dst )
1329 {
1330 if ( dstLen < srcLen )
1331 return wxCONV_FAILED;
1332
1333 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1334 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1335 {
1336 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1337 }
1338 }
1339
1340 return srcLen;
1341 }
1342
1343 #else // !WC_UTF16: wchar_t is UTF-32
1344
1345 // ----------------------------------------------------------------------------
1346 // conversions without endianness change
1347 // ----------------------------------------------------------------------------
1348
1349 size_t
1350 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1351 const char *src, size_t srcLen) const
1352 {
1353 srcLen = GetLength(src, srcLen);
1354 if ( srcLen == wxNO_LEN )
1355 return wxCONV_FAILED;
1356
1357 const size_t inLen = srcLen / BYTES_PER_CHAR;
1358 if ( !dst )
1359 {
1360 // optimization: return maximal space which could be needed for this
1361 // string even if the real size could be smaller if the buffer contains
1362 // any surrogates
1363 return inLen;
1364 }
1365
1366 size_t outLen = 0;
1367 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1368 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1369 {
1370 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1371 if ( !inBuff )
1372 return wxCONV_FAILED;
1373
1374 if ( ++outLen > dstLen )
1375 return wxCONV_FAILED;
1376
1377 *dst++ = ch;
1378 }
1379
1380
1381 return outLen;
1382 }
1383
1384 size_t
1385 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1386 const wchar_t *src, size_t srcLen) const
1387 {
1388 if ( srcLen == wxNO_LEN )
1389 srcLen = wxWcslen(src) + 1;
1390
1391 size_t outLen = 0;
1392 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1393 for ( size_t n = 0; n < srcLen; n++ )
1394 {
1395 wxUint16 cc[2];
1396 const size_t numChars = encode_utf16(*src++, cc);
1397 if ( numChars == wxCONV_FAILED )
1398 return wxCONV_FAILED;
1399
1400 outLen += numChars * BYTES_PER_CHAR;
1401 if ( outBuff )
1402 {
1403 if ( outLen > dstLen )
1404 return wxCONV_FAILED;
1405
1406 *outBuff++ = cc[0];
1407 if ( numChars == 2 )
1408 {
1409 // second character of a surrogate
1410 *outBuff++ = cc[1];
1411 }
1412 }
1413 }
1414
1415 return outLen;
1416 }
1417
1418 // ----------------------------------------------------------------------------
1419 // endian-reversing conversions
1420 // ----------------------------------------------------------------------------
1421
1422 size_t
1423 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1424 const char *src, size_t srcLen) const
1425 {
1426 srcLen = GetLength(src, srcLen);
1427 if ( srcLen == wxNO_LEN )
1428 return wxCONV_FAILED;
1429
1430 const size_t inLen = srcLen / BYTES_PER_CHAR;
1431 if ( !dst )
1432 {
1433 // optimization: return maximal space which could be needed for this
1434 // string even if the real size could be smaller if the buffer contains
1435 // any surrogates
1436 return inLen;
1437 }
1438
1439 size_t outLen = 0;
1440 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1441 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1442 {
1443 wxUint32 ch;
1444 wxUint16 tmp[2];
1445
1446 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1447 inBuff++;
1448 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1449
1450 const size_t numChars = decode_utf16(tmp, ch);
1451 if ( numChars == wxCONV_FAILED )
1452 return wxCONV_FAILED;
1453
1454 if ( numChars == 2 )
1455 inBuff++;
1456
1457 if ( ++outLen > dstLen )
1458 return wxCONV_FAILED;
1459
1460 *dst++ = ch;
1461 }
1462
1463
1464 return outLen;
1465 }
1466
1467 size_t
1468 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1469 const wchar_t *src, size_t srcLen) const
1470 {
1471 if ( srcLen == wxNO_LEN )
1472 srcLen = wxWcslen(src) + 1;
1473
1474 size_t outLen = 0;
1475 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1476 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1477 {
1478 wxUint16 cc[2];
1479 const size_t numChars = encode_utf16(*src, cc);
1480 if ( numChars == wxCONV_FAILED )
1481 return wxCONV_FAILED;
1482
1483 outLen += numChars * BYTES_PER_CHAR;
1484 if ( outBuff )
1485 {
1486 if ( outLen > dstLen )
1487 return wxCONV_FAILED;
1488
1489 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1490 if ( numChars == 2 )
1491 {
1492 // second character of a surrogate
1493 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1494 }
1495 }
1496 }
1497
1498 return outLen;
1499 }
1500
1501 #endif // WC_UTF16/!WC_UTF16
1502
1503
1504 // ============================================================================
1505 // UTF-32
1506 // ============================================================================
1507
// As for UTF-16, the UTF-32 converters are implemented as "straight" (no
// byte swapping) and "swap" (every 32 bit unit is byte-swapped) variants and
// these aliases select, depending on whether WORDS_BIGENDIAN is defined,
// which of wxMBConvUTF32BE/wxMBConvUTF32LE each variant implements.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight wxMBConvUTF32BE
#define wxMBConvUTF32swap wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap wxMBConvUTF32BE
#define wxMBConvUTF32straight wxMBConvUTF32LE
#endif


// the UTF-32 converter objects, one per byte order
// NOTE(review): WXDLLIMPEXP_DATA_BASE() presumably only adds the DLL
// import/export decoration to the type -- confirm
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1519
1520 /* static */
1521 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1522 {
1523 if ( srcLen == wxNO_LEN )
1524 {
1525 // count the number of bytes in input, including the trailing NULs
1526 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1527 for ( srcLen = 1; *inBuff++; srcLen++ )
1528 ;
1529
1530 srcLen *= BYTES_PER_CHAR;
1531 }
1532 else // we already have the length
1533 {
1534 // we can only convert an entire number of UTF-32 characters
1535 if ( srcLen % BYTES_PER_CHAR )
1536 return wxCONV_FAILED;
1537 }
1538
1539 return srcLen;
1540 }
1541
1542 // case when in-memory representation is UTF-16
1543 #ifdef WC_UTF16
1544
1545 // ----------------------------------------------------------------------------
1546 // conversions without endianness change
1547 // ----------------------------------------------------------------------------
1548
1549 size_t
1550 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1551 const char *src, size_t srcLen) const
1552 {
1553 srcLen = GetLength(src, srcLen);
1554 if ( srcLen == wxNO_LEN )
1555 return wxCONV_FAILED;
1556
1557 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1558 const size_t inLen = srcLen / BYTES_PER_CHAR;
1559 size_t outLen = 0;
1560 for ( size_t n = 0; n < inLen; n++ )
1561 {
1562 wxUint16 cc[2];
1563 const size_t numChars = encode_utf16(*inBuff++, cc);
1564 if ( numChars == wxCONV_FAILED )
1565 return wxCONV_FAILED;
1566
1567 outLen += numChars;
1568 if ( dst )
1569 {
1570 if ( outLen > dstLen )
1571 return wxCONV_FAILED;
1572
1573 *dst++ = cc[0];
1574 if ( numChars == 2 )
1575 {
1576 // second character of a surrogate
1577 *dst++ = cc[1];
1578 }
1579 }
1580 }
1581
1582 return outLen;
1583 }
1584
1585 size_t
1586 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1587 const wchar_t *src, size_t srcLen) const
1588 {
1589 if ( srcLen == wxNO_LEN )
1590 srcLen = wxWcslen(src) + 1;
1591
1592 if ( !dst )
1593 {
1594 // optimization: return maximal space which could be needed for this
1595 // string instead of the exact amount which could be less if there are
1596 // any surrogates in the input
1597 //
1598 // we consider that surrogates are rare enough to make it worthwhile to
1599 // avoid running the loop below at the cost of slightly extra memory
1600 // consumption
1601 return srcLen * BYTES_PER_CHAR;
1602 }
1603
1604 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1605 size_t outLen = 0;
1606 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1607 {
1608 const wxUint32 ch = wxDecodeSurrogate(&src);
1609 if ( !src )
1610 return wxCONV_FAILED;
1611
1612 outLen += BYTES_PER_CHAR;
1613
1614 if ( outLen > dstLen )
1615 return wxCONV_FAILED;
1616
1617 *outBuff++ = ch;
1618 }
1619
1620 return outLen;
1621 }
1622
1623 // ----------------------------------------------------------------------------
1624 // endian-reversing conversions
1625 // ----------------------------------------------------------------------------
1626
1627 size_t
1628 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1629 const char *src, size_t srcLen) const
1630 {
1631 srcLen = GetLength(src, srcLen);
1632 if ( srcLen == wxNO_LEN )
1633 return wxCONV_FAILED;
1634
1635 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1636 const size_t inLen = srcLen / BYTES_PER_CHAR;
1637 size_t outLen = 0;
1638 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1639 {
1640 wxUint16 cc[2];
1641 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1642 if ( numChars == wxCONV_FAILED )
1643 return wxCONV_FAILED;
1644
1645 outLen += numChars;
1646 if ( dst )
1647 {
1648 if ( outLen > dstLen )
1649 return wxCONV_FAILED;
1650
1651 *dst++ = cc[0];
1652 if ( numChars == 2 )
1653 {
1654 // second character of a surrogate
1655 *dst++ = cc[1];
1656 }
1657 }
1658 }
1659
1660 return outLen;
1661 }
1662
1663 size_t
1664 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1665 const wchar_t *src, size_t srcLen) const
1666 {
1667 if ( srcLen == wxNO_LEN )
1668 srcLen = wxWcslen(src) + 1;
1669
1670 if ( !dst )
1671 {
1672 // optimization: return maximal space which could be needed for this
1673 // string instead of the exact amount which could be less if there are
1674 // any surrogates in the input
1675 //
1676 // we consider that surrogates are rare enough to make it worthwhile to
1677 // avoid running the loop below at the cost of slightly extra memory
1678 // consumption
1679 return srcLen*BYTES_PER_CHAR;
1680 }
1681
1682 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1683 size_t outLen = 0;
1684 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1685 {
1686 const wxUint32 ch = wxDecodeSurrogate(&src);
1687 if ( !src )
1688 return wxCONV_FAILED;
1689
1690 outLen += BYTES_PER_CHAR;
1691
1692 if ( outLen > dstLen )
1693 return wxCONV_FAILED;
1694
1695 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1696 }
1697
1698 return outLen;
1699 }
1700
1701 #else // !WC_UTF16: wchar_t is UTF-32
1702
1703 // ----------------------------------------------------------------------------
1704 // conversions without endianness change
1705 // ----------------------------------------------------------------------------
1706
1707 size_t
1708 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1709 const char *src, size_t srcLen) const
1710 {
1711 // use memcpy() as it should be much faster than hand-written loop
1712 srcLen = GetLength(src, srcLen);
1713 if ( srcLen == wxNO_LEN )
1714 return wxCONV_FAILED;
1715
1716 const size_t inLen = srcLen/BYTES_PER_CHAR;
1717 if ( dst )
1718 {
1719 if ( dstLen < inLen )
1720 return wxCONV_FAILED;
1721
1722 memcpy(dst, src, srcLen);
1723 }
1724
1725 return inLen;
1726 }
1727
1728 size_t
1729 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1730 const wchar_t *src, size_t srcLen) const
1731 {
1732 if ( srcLen == wxNO_LEN )
1733 srcLen = wxWcslen(src) + 1;
1734
1735 srcLen *= BYTES_PER_CHAR;
1736
1737 if ( dst )
1738 {
1739 if ( dstLen < srcLen )
1740 return wxCONV_FAILED;
1741
1742 memcpy(dst, src, srcLen);
1743 }
1744
1745 return srcLen;
1746 }
1747
1748 // ----------------------------------------------------------------------------
1749 // endian-reversing conversions
1750 // ----------------------------------------------------------------------------
1751
1752 size_t
1753 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1754 const char *src, size_t srcLen) const
1755 {
1756 srcLen = GetLength(src, srcLen);
1757 if ( srcLen == wxNO_LEN )
1758 return wxCONV_FAILED;
1759
1760 srcLen /= BYTES_PER_CHAR;
1761
1762 if ( dst )
1763 {
1764 if ( dstLen < srcLen )
1765 return wxCONV_FAILED;
1766
1767 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1768 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1769 {
1770 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1771 }
1772 }
1773
1774 return srcLen;
1775 }
1776
1777 size_t
1778 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1779 const wchar_t *src, size_t srcLen) const
1780 {
1781 if ( srcLen == wxNO_LEN )
1782 srcLen = wxWcslen(src) + 1;
1783
1784 srcLen *= BYTES_PER_CHAR;
1785
1786 if ( dst )
1787 {
1788 if ( dstLen < srcLen )
1789 return wxCONV_FAILED;
1790
1791 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1792 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1793 {
1794 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1795 }
1796 }
1797
1798 return srcLen;
1799 }
1800
1801 #endif // WC_UTF16/!WC_UTF16
1802
1803
1804 // ============================================================================
1805 // The classes doing conversion using the iconv_xxx() functions
1806 // ============================================================================
1807
1808 #ifdef HAVE_ICONV
1809
1810 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1811 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1812 // (unless there's yet another bug in glibc) the only case when iconv()
1813 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1814 // left in the input buffer -- when _real_ error occurs,
1815 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1816 // iconv() failure.
1817 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): test whether an iconv() call returning cres
// and leaving bufLeft bytes of input unconverted has really failed; the first
// definition implements the glibc 2.1 workaround described above
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
#endif

// cast the address of the input pointer to the type used for it in this
// file's iconv() calls (ICONV_CONST is defined outside of this file section)
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// the value this file compares iconv_open() results with to detect failure
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP reverses the bytes of a single wchar_t and WC_ENC is the wx
// encoding constant corresponding to wchar_t, both chosen by its size
#if SIZEOF_WCHAR_T == 4
#define WC_BSWAP wxUINT32_SWAP_ALWAYS
#define WC_ENC wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
#define WC_BSWAP wxUINT16_SWAP_ALWAYS
#define WC_ENC wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
// does this ever happen?
#error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1839
1840 // ----------------------------------------------------------------------------
1841 // wxMBConv_iconv: encapsulates an iconv character set
1842 // ----------------------------------------------------------------------------
1843
1844 class wxMBConv_iconv : public wxMBConv
1845 {
1846 public:
1847 wxMBConv_iconv(const char *name);
1848 virtual ~wxMBConv_iconv();
1849
1850 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1851 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1852
1853 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1854 virtual size_t GetMBNulLen() const;
1855
1856 #if wxUSE_UNICODE_UTF8
1857 virtual bool IsUTF8() const;
1858 #endif
1859
1860 virtual wxMBConv *Clone() const
1861 {
1862 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1863 p->m_minMBCharWidth = m_minMBCharWidth;
1864 return p;
1865 }
1866
1867 bool IsOk() const
1868 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1869
1870 protected:
1871 // the iconv handlers used to translate from multibyte
1872 // to wide char and in the other direction
1873 iconv_t m2w,
1874 w2m;
1875
1876 #if wxUSE_THREADS
1877 // guards access to m2w and w2m objects
1878 wxMutex m_iconvMutex;
1879 #endif
1880
1881 private:
1882 // the name (for iconv_open()) of a wide char charset -- if none is
1883 // available on this machine, it will remain NULL
1884 static wxString ms_wcCharsetName;
1885
1886 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1887 // different endian-ness than the native one
1888 static bool ms_wcNeedsSwap;
1889
1890
1891 // name of the encoding handled by this conversion
1892 wxString m_name;
1893
1894 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1895 // initially
1896 size_t m_minMBCharWidth;
1897 };
1898
1899 // make the constructor available for unit testing
1900 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1901 {
1902 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1903 if ( !result->IsOk() )
1904 {
1905 delete result;
1906 return 0;
1907 }
1908
1909 return result;
1910 }
1911
// definitions of the static members: the wchar_t charset name starts out
// empty and is filled in by the wxMBConv_iconv constructor the first time a
// usable charset is found
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1914
1915 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1916 : m_name(name)
1917 {
1918 m_minMBCharWidth = 0;
1919
1920 // check for charset that represents wchar_t:
1921 if ( ms_wcCharsetName.empty() )
1922 {
1923 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1924
1925 #if wxUSE_FONTMAP
1926 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1927 #else // !wxUSE_FONTMAP
1928 static const wxChar *names_static[] =
1929 {
1930 #if SIZEOF_WCHAR_T == 4
1931 _T("UCS-4"),
1932 #elif SIZEOF_WCHAR_T = 2
1933 _T("UCS-2"),
1934 #endif
1935 NULL
1936 };
1937 const wxChar **names = names_static;
1938 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1939
1940 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1941 {
1942 const wxString nameCS(*names);
1943
1944 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1945 wxString nameXE(nameCS);
1946
1947 #ifdef WORDS_BIGENDIAN
1948 nameXE += _T("BE");
1949 #else // little endian
1950 nameXE += _T("LE");
1951 #endif
1952
1953 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1954 nameXE.c_str());
1955
1956 m2w = iconv_open(nameXE.ToAscii(), name);
1957 if ( m2w == ICONV_T_INVALID )
1958 {
1959 // try charset w/o bytesex info (e.g. "UCS4")
1960 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1961 nameCS.c_str());
1962 m2w = iconv_open(nameCS.ToAscii(), name);
1963
1964 // and check for bytesex ourselves:
1965 if ( m2w != ICONV_T_INVALID )
1966 {
1967 char buf[2], *bufPtr;
1968 wchar_t wbuf[2], *wbufPtr;
1969 size_t insz, outsz;
1970 size_t res;
1971
1972 buf[0] = 'A';
1973 buf[1] = 0;
1974 wbuf[0] = 0;
1975 insz = 2;
1976 outsz = SIZEOF_WCHAR_T * 2;
1977 wbufPtr = wbuf;
1978 bufPtr = buf;
1979
1980 res = iconv(
1981 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1982 (char**)&wbufPtr, &outsz);
1983
1984 if (ICONV_FAILED(res, insz))
1985 {
1986 wxLogLastError(wxT("iconv"));
1987 wxLogError(_("Conversion to charset '%s' doesn't work."),
1988 nameCS.c_str());
1989 }
1990 else // ok, can convert to this encoding, remember it
1991 {
1992 ms_wcCharsetName = nameCS;
1993 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1994 }
1995 }
1996 }
1997 else // use charset not requiring byte swapping
1998 {
1999 ms_wcCharsetName = nameXE;
2000 }
2001 }
2002
2003 wxLogTrace(TRACE_STRCONV,
2004 wxT("iconv wchar_t charset is \"%s\"%s"),
2005 ms_wcCharsetName.empty() ? wxString("<none>")
2006 : ms_wcCharsetName,
2007 ms_wcNeedsSwap ? _T(" (needs swap)")
2008 : _T(""));
2009 }
2010 else // we already have ms_wcCharsetName
2011 {
2012 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2013 }
2014
2015 if ( ms_wcCharsetName.empty() )
2016 {
2017 w2m = ICONV_T_INVALID;
2018 }
2019 else
2020 {
2021 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2022 if ( w2m == ICONV_T_INVALID )
2023 {
2024 wxLogTrace(TRACE_STRCONV,
2025 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2026 ms_wcCharsetName.c_str(), name);
2027 }
2028 }
2029 }
2030
2031 wxMBConv_iconv::~wxMBConv_iconv()
2032 {
2033 if ( m2w != ICONV_T_INVALID )
2034 iconv_close(m2w);
2035 if ( w2m != ICONV_T_INVALID )
2036 iconv_close(w2m);
2037 }
2038
2039 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2040 {
2041 // find the string length: notice that must be done differently for
2042 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2043 size_t inbuf;
2044 const size_t nulLen = GetMBNulLen();
2045 switch ( nulLen )
2046 {
2047 default:
2048 return wxCONV_FAILED;
2049
2050 case 1:
2051 inbuf = strlen(psz); // arguably more optimized than our version
2052 break;
2053
2054 case 2:
2055 case 4:
2056 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2057 // they also have to start at character boundary and not span two
2058 // adjacent characters
2059 const char *p;
2060 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2061 ;
2062 inbuf = p - psz;
2063 break;
2064 }
2065
2066 #if wxUSE_THREADS
2067 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2068 // Unfortunately there are a couple of global wxCSConv objects such as
2069 // wxConvLocal that are used all over wx code, so we have to make sure
2070 // the handle is used by at most one thread at the time. Otherwise
2071 // only a few wx classes would be safe to use from non-main threads
2072 // as MB<->WC conversion would fail "randomly".
2073 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2074 #endif // wxUSE_THREADS
2075
2076 size_t outbuf = n * SIZEOF_WCHAR_T;
2077 size_t res, cres;
2078 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2079 wchar_t *bufPtr = buf;
2080 const char *pszPtr = psz;
2081
2082 if (buf)
2083 {
2084 // have destination buffer, convert there
2085 cres = iconv(m2w,
2086 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2087 (char**)&bufPtr, &outbuf);
2088 res = n - (outbuf / SIZEOF_WCHAR_T);
2089
2090 if (ms_wcNeedsSwap)
2091 {
2092 // convert to native endianness
2093 for ( unsigned i = 0; i < res; i++ )
2094 buf[n] = WC_BSWAP(buf[i]);
2095 }
2096
2097 // NUL-terminate the string if there is any space left
2098 if (res < n)
2099 buf[res] = 0;
2100 }
2101 else
2102 {
2103 // no destination buffer... convert using temp buffer
2104 // to calculate destination buffer requirement
2105 wchar_t tbuf[8];
2106 res = 0;
2107
2108 do
2109 {
2110 bufPtr = tbuf;
2111 outbuf = 8 * SIZEOF_WCHAR_T;
2112
2113 cres = iconv(m2w,
2114 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2115 (char**)&bufPtr, &outbuf );
2116
2117 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2118 }
2119 while ((cres == (size_t)-1) && (errno == E2BIG));
2120 }
2121
2122 if (ICONV_FAILED(cres, inbuf))
2123 {
2124 //VS: it is ok if iconv fails, hence trace only
2125 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2126 return wxCONV_FAILED;
2127 }
2128
2129 return res;
2130 }
2131
2132 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2133 {
2134 #if wxUSE_THREADS
2135 // NB: explained in MB2WC
2136 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2137 #endif
2138
2139 size_t inlen = wxWcslen(psz);
2140 size_t inbuf = inlen * SIZEOF_WCHAR_T;
2141 size_t outbuf = n;
2142 size_t res, cres;
2143
2144 wchar_t *tmpbuf = 0;
2145
2146 if (ms_wcNeedsSwap)
2147 {
2148 // need to copy to temp buffer to switch endianness
2149 // (doing WC_BSWAP twice on the original buffer won't help, as it
2150 // could be in read-only memory, or be accessed in some other thread)
2151 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2152 for ( size_t i = 0; i < inlen; i++ )
2153 tmpbuf[n] = WC_BSWAP(psz[i]);
2154
2155 tmpbuf[inlen] = L'\0';
2156 psz = tmpbuf;
2157 }
2158
2159 if (buf)
2160 {
2161 // have destination buffer, convert there
2162 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2163
2164 res = n - outbuf;
2165
2166 // NB: iconv was given only wcslen(psz) characters on input, and so
2167 // it couldn't convert the trailing zero. Let's do it ourselves
2168 // if there's some room left for it in the output buffer.
2169 if (res < n)
2170 buf[0] = 0;
2171 }
2172 else
2173 {
2174 // no destination buffer: convert using temp buffer
2175 // to calculate destination buffer requirement
2176 char tbuf[16];
2177 res = 0;
2178 do
2179 {
2180 buf = tbuf;
2181 outbuf = 16;
2182
2183 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2184
2185 res += 16 - outbuf;
2186 }
2187 while ((cres == (size_t)-1) && (errno == E2BIG));
2188 }
2189
2190 if (ms_wcNeedsSwap)
2191 {
2192 free(tmpbuf);
2193 }
2194
2195 if (ICONV_FAILED(cres, inbuf))
2196 {
2197 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2198 return wxCONV_FAILED;
2199 }
2200
2201 return res;
2202 }
2203
2204 size_t wxMBConv_iconv::GetMBNulLen() const
2205 {
2206 if ( m_minMBCharWidth == 0 )
2207 {
2208 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2209
2210 #if wxUSE_THREADS
2211 // NB: explained in MB2WC
2212 wxMutexLocker lock(self->m_iconvMutex);
2213 #endif
2214
2215 const wchar_t *wnul = L"";
2216 char buf[8]; // should be enough for NUL in any encoding
2217 size_t inLen = sizeof(wchar_t),
2218 outLen = WXSIZEOF(buf);
2219 char *inBuff = (char *)wnul;
2220 char *outBuff = buf;
2221 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2222 {
2223 self->m_minMBCharWidth = (size_t)-1;
2224 }
2225 else // ok
2226 {
2227 self->m_minMBCharWidth = outBuff - buf;
2228 }
2229 }
2230
2231 return m_minMBCharWidth;
2232 }
2233
#if wxUSE_UNICODE_UTF8
bool wxMBConv_iconv::IsUTF8() const
{
    // accept the charset name spelt both with and without the hyphen,
    // ignoring case
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
2241
2242 #endif // HAVE_ICONV
2243
2244
2245 // ============================================================================
2246 // Win32 conversion classes
2247 // ============================================================================
2248
2249 #ifdef wxHAVE_WIN32_MB2WC
2250
2251 // from utils.cpp
2252 #if wxUSE_FONTMAP
2253 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2254 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2255 #endif
2256
2257 class wxMBConv_win32 : public wxMBConv
2258 {
2259 public:
2260 wxMBConv_win32()
2261 {
2262 m_CodePage = CP_ACP;
2263 m_minMBCharWidth = 0;
2264 }
2265
2266 wxMBConv_win32(const wxMBConv_win32& conv)
2267 : wxMBConv()
2268 {
2269 m_CodePage = conv.m_CodePage;
2270 m_minMBCharWidth = conv.m_minMBCharWidth;
2271 }
2272
2273 #if wxUSE_FONTMAP
2274 wxMBConv_win32(const char* name)
2275 {
2276 m_CodePage = wxCharsetToCodepage(name);
2277 m_minMBCharWidth = 0;
2278 }
2279
2280 wxMBConv_win32(wxFontEncoding encoding)
2281 {
2282 m_CodePage = wxEncodingToCodepage(encoding);
2283 m_minMBCharWidth = 0;
2284 }
2285 #endif // wxUSE_FONTMAP
2286
2287 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2288 {
2289 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2290 // the behaviour is not compatible with the Unix version (using iconv)
2291 // and break the library itself, e.g. wxTextInputStream::NextChar()
2292 // wouldn't work if reading an incomplete MB char didn't result in an
2293 // error
2294 //
2295 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2296 // Win XP or newer and it is not supported for UTF-[78] so we always
2297 // use our own conversions in this case. See
2298 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2299 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2300 if ( m_CodePage == CP_UTF8 )
2301 {
2302 return wxMBConvUTF8().MB2WC(buf, psz, n);
2303 }
2304
2305 if ( m_CodePage == CP_UTF7 )
2306 {
2307 return wxMBConvUTF7().MB2WC(buf, psz, n);
2308 }
2309
2310 int flags = 0;
2311 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2312 IsAtLeastWin2kSP4() )
2313 {
2314 flags = MB_ERR_INVALID_CHARS;
2315 }
2316
2317 const size_t len = ::MultiByteToWideChar
2318 (
2319 m_CodePage, // code page
2320 flags, // flags: fall on error
2321 psz, // input string
2322 -1, // its length (NUL-terminated)
2323 buf, // output string
2324 buf ? n : 0 // size of output buffer
2325 );
2326 if ( !len )
2327 {
2328 // function totally failed
2329 return wxCONV_FAILED;
2330 }
2331
2332 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2333 // check if we succeeded, by doing a double trip:
2334 if ( !flags && buf )
2335 {
2336 const size_t mbLen = strlen(psz);
2337 wxCharBuffer mbBuf(mbLen);
2338 if ( ::WideCharToMultiByte
2339 (
2340 m_CodePage,
2341 0,
2342 buf,
2343 -1,
2344 mbBuf.data(),
2345 mbLen + 1, // size in bytes, not length
2346 NULL,
2347 NULL
2348 ) == 0 ||
2349 strcmp(mbBuf, psz) != 0 )
2350 {
2351 // we didn't obtain the same thing we started from, hence
2352 // the conversion was lossy and we consider that it failed
2353 return wxCONV_FAILED;
2354 }
2355 }
2356
2357 // note that it returns count of written chars for buf != NULL and size
2358 // of the needed buffer for buf == NULL so in either case the length of
2359 // the string (which never includes the terminating NUL) is one less
2360 return len - 1;
2361 }
2362
2363 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2364 {
2365 /*
2366 we have a problem here: by default, WideCharToMultiByte() may
2367 replace characters unrepresentable in the target code page with bad
2368 quality approximations such as turning "1/2" symbol (U+00BD) into
2369 "1" for the code pages which don't have it and we, obviously, want
2370 to avoid this at any price
2371
2372 the trouble is that this function does it _silently_, i.e. it won't
2373 even tell us whether it did or not... Win98/2000 and higher provide
2374 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2375 we have to resort to a round trip, i.e. check that converting back
2376 results in the same string -- this is, of course, expensive but
2377 otherwise we simply can't be sure to not garble the data.
2378 */
2379
2380 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2381 // it doesn't work with CJK encodings (which we test for rather roughly
2382 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2383 // supporting it
2384 BOOL usedDef wxDUMMY_INITIALIZE(false);
2385 BOOL *pUsedDef;
2386 int flags;
2387 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2388 {
2389 // it's our lucky day
2390 flags = WC_NO_BEST_FIT_CHARS;
2391 pUsedDef = &usedDef;
2392 }
2393 else // old system or unsupported encoding
2394 {
2395 flags = 0;
2396 pUsedDef = NULL;
2397 }
2398
2399 const size_t len = ::WideCharToMultiByte
2400 (
2401 m_CodePage, // code page
2402 flags, // either none or no best fit
2403 pwz, // input string
2404 -1, // it is (wide) NUL-terminated
2405 buf, // output buffer
2406 buf ? n : 0, // and its size
2407 NULL, // default "replacement" char
2408 pUsedDef // [out] was it used?
2409 );
2410
2411 if ( !len )
2412 {
2413 // function totally failed
2414 return wxCONV_FAILED;
2415 }
2416
2417 // if we were really converting, check if we succeeded
2418 if ( buf )
2419 {
2420 if ( flags )
2421 {
2422 // check if the conversion failed, i.e. if any replacements
2423 // were done
2424 if ( usedDef )
2425 return wxCONV_FAILED;
2426 }
2427 else // we must resort to double tripping...
2428 {
2429 wxWCharBuffer wcBuf(n);
2430 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2431 wcscmp(wcBuf, pwz) != 0 )
2432 {
2433 // we didn't obtain the same thing we started from, hence
2434 // the conversion was lossy and we consider that it failed
2435 return wxCONV_FAILED;
2436 }
2437 }
2438 }
2439
2440 // see the comment above for the reason of "len - 1"
2441 return len - 1;
2442 }
2443
2444 virtual size_t GetMBNulLen() const
2445 {
2446 if ( m_minMBCharWidth == 0 )
2447 {
2448 int len = ::WideCharToMultiByte
2449 (
2450 m_CodePage, // code page
2451 0, // no flags
2452 L"", // input string
2453 1, // translate just the NUL
2454 NULL, // output buffer
2455 0, // and its size
2456 NULL, // no replacement char
2457 NULL // [out] don't care if it was used
2458 );
2459
2460 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2461 switch ( len )
2462 {
2463 default:
2464 wxLogDebug(_T("Unexpected NUL length %d"), len);
2465 self->m_minMBCharWidth = (size_t)-1;
2466 break;
2467
2468 case 0:
2469 self->m_minMBCharWidth = (size_t)-1;
2470 break;
2471
2472 case 1:
2473 case 2:
2474 case 4:
2475 self->m_minMBCharWidth = len;
2476 break;
2477 }
2478 }
2479
2480 return m_minMBCharWidth;
2481 }
2482
2483 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2484
2485 bool IsOk() const { return m_CodePage != -1; }
2486
2487 private:
2488 static bool CanUseNoBestFit()
2489 {
2490 static int s_isWin98Or2k = -1;
2491
2492 if ( s_isWin98Or2k == -1 )
2493 {
2494 int verMaj, verMin;
2495 switch ( wxGetOsVersion(&verMaj, &verMin) )
2496 {
2497 case wxOS_WINDOWS_9X:
2498 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2499 break;
2500
2501 case wxOS_WINDOWS_NT:
2502 s_isWin98Or2k = verMaj >= 5;
2503 break;
2504
2505 default:
2506 // unknown: be conservative by default
2507 s_isWin98Or2k = 0;
2508 break;
2509 }
2510
2511 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2512 }
2513
2514 return s_isWin98Or2k == 1;
2515 }
2516
2517 static bool IsAtLeastWin2kSP4()
2518 {
2519 #ifdef __WXWINCE__
2520 return false;
2521 #else
2522 static int s_isAtLeastWin2kSP4 = -1;
2523
2524 if ( s_isAtLeastWin2kSP4 == -1 )
2525 {
2526 OSVERSIONINFOEX ver;
2527
2528 memset(&ver, 0, sizeof(ver));
2529 ver.dwOSVersionInfoSize = sizeof(ver);
2530 GetVersionEx((OSVERSIONINFO*)&ver);
2531
2532 s_isAtLeastWin2kSP4 =
2533 ((ver.dwMajorVersion > 5) || // Vista+
2534 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2535 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2536 ver.wServicePackMajor >= 4)) // 2000 SP4+
2537 ? 1 : 0;
2538 }
2539
2540 return s_isAtLeastWin2kSP4 == 1;
2541 #endif
2542 }
2543
2544
2545 // the code page we're working with
2546 long m_CodePage;
2547
2548 // cached result of GetMBNulLen(), set to 0 initially meaning
2549 // "unknown"
2550 size_t m_minMBCharWidth;
2551 };
2552
2553 #endif // wxHAVE_WIN32_MB2WC
2554
2555
2556 // ============================================================================
2557 // wxEncodingConverter based conversion classes
2558 // ============================================================================
2559
2560 #if wxUSE_FONTMAP
2561
2562 class wxMBConv_wxwin : public wxMBConv
2563 {
2564 private:
2565 void Init()
2566 {
2567 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2568 // The wxMBConv_cf class does a better job.
2569 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2570 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2571 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2572 }
2573
2574 public:
2575 // temporarily just use wxEncodingConverter stuff,
2576 // so that it works while a better implementation is built
2577 wxMBConv_wxwin(const char* name)
2578 {
2579 if (name)
2580 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2581 else
2582 m_enc = wxFONTENCODING_SYSTEM;
2583
2584 Init();
2585 }
2586
2587 wxMBConv_wxwin(wxFontEncoding enc)
2588 {
2589 m_enc = enc;
2590
2591 Init();
2592 }
2593
2594 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2595 {
2596 size_t inbuf = strlen(psz);
2597 if (buf)
2598 {
2599 if (!m2w.Convert(psz, buf))
2600 return wxCONV_FAILED;
2601 }
2602 return inbuf;
2603 }
2604
2605 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2606 {
2607 const size_t inbuf = wxWcslen(psz);
2608 if (buf)
2609 {
2610 if (!w2m.Convert(psz, buf))
2611 return wxCONV_FAILED;
2612 }
2613
2614 return inbuf;
2615 }
2616
2617 virtual size_t GetMBNulLen() const
2618 {
2619 switch ( m_enc )
2620 {
2621 case wxFONTENCODING_UTF16BE:
2622 case wxFONTENCODING_UTF16LE:
2623 return 2;
2624
2625 case wxFONTENCODING_UTF32BE:
2626 case wxFONTENCODING_UTF32LE:
2627 return 4;
2628
2629 default:
2630 return 1;
2631 }
2632 }
2633
2634 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2635
2636 bool IsOk() const { return m_ok; }
2637
2638 public:
2639 wxFontEncoding m_enc;
2640 wxEncodingConverter m2w, w2m;
2641
2642 private:
2643 // were we initialized successfully?
2644 bool m_ok;
2645
2646 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2647 };
2648
2649 // make the constructors available for unit testing
2650 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2651 {
2652 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2653 if ( !result->IsOk() )
2654 {
2655 delete result;
2656 return 0;
2657 }
2658
2659 return result;
2660 }
2661
2662 #endif // wxUSE_FONTMAP
2663
2664 // ============================================================================
2665 // wxCSConv implementation
2666 // ============================================================================
2667
2668 void wxCSConv::Init()
2669 {
2670 m_name = NULL;
2671 m_convReal = NULL;
2672 m_deferred = true;
2673 }
2674
2675 wxCSConv::wxCSConv(const wxString& charset)
2676 {
2677 Init();
2678
2679 if ( !charset.empty() )
2680 {
2681 SetName(charset.ToAscii());
2682 }
2683
2684 #if wxUSE_FONTMAP
2685 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2686 #else
2687 m_encoding = wxFONTENCODING_SYSTEM;
2688 #endif
2689 }
2690
2691 wxCSConv::wxCSConv(wxFontEncoding encoding)
2692 {
2693 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2694 {
2695 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2696
2697 encoding = wxFONTENCODING_SYSTEM;
2698 }
2699
2700 Init();
2701
2702 m_encoding = encoding;
2703 }
2704
2705 wxCSConv::~wxCSConv()
2706 {
2707 Clear();
2708 }
2709
2710 wxCSConv::wxCSConv(const wxCSConv& conv)
2711 : wxMBConv()
2712 {
2713 Init();
2714
2715 SetName(conv.m_name);
2716 m_encoding = conv.m_encoding;
2717 }
2718
2719 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2720 {
2721 Clear();
2722
2723 SetName(conv.m_name);
2724 m_encoding = conv.m_encoding;
2725
2726 return *this;
2727 }
2728
2729 void wxCSConv::Clear()
2730 {
2731 free(m_name);
2732 delete m_convReal;
2733
2734 m_name = NULL;
2735 m_convReal = NULL;
2736 }
2737
2738 void wxCSConv::SetName(const char *charset)
2739 {
2740 if (charset)
2741 {
2742 m_name = strdup(charset);
2743 m_deferred = true;
2744 }
2745 }
2746
2747 #if wxUSE_FONTMAP
2748
2749 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2750 wxEncodingNameCache );
2751
2752 static wxEncodingNameCache gs_nameCache;
2753 #endif
2754
2755 wxMBConv *wxCSConv::DoCreate() const
2756 {
2757 #if wxUSE_FONTMAP
2758 wxLogTrace(TRACE_STRCONV,
2759 wxT("creating conversion for %s"),
2760 (m_name ? m_name
2761 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2762 #endif // wxUSE_FONTMAP
2763
2764 // check for the special case of ASCII or ISO8859-1 charset: as we have
2765 // special knowledge of it anyhow, we don't need to create a special
2766 // conversion object
2767 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2768 m_encoding == wxFONTENCODING_DEFAULT )
2769 {
2770 // don't convert at all
2771 return NULL;
2772 }
2773
2774 // we trust OS to do conversion better than we can so try external
2775 // conversion methods first
2776 //
2777 // the full order is:
2778 // 1. OS conversion (iconv() under Unix or Win32 API)
2779 // 2. hard coded conversions for UTF
2780 // 3. wxEncodingConverter as fall back
2781
2782 // step (1)
2783 #ifdef HAVE_ICONV
2784 #if !wxUSE_FONTMAP
2785 if ( m_name )
2786 #endif // !wxUSE_FONTMAP
2787 {
2788 #if wxUSE_FONTMAP
2789 wxFontEncoding encoding(m_encoding);
2790 #endif
2791
2792 if ( m_name )
2793 {
2794 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2795 if ( conv->IsOk() )
2796 return conv;
2797
2798 delete conv;
2799
2800 #if wxUSE_FONTMAP
2801 encoding =
2802 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2803 #endif // wxUSE_FONTMAP
2804 }
2805 #if wxUSE_FONTMAP
2806 {
2807 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2808 if ( it != gs_nameCache.end() )
2809 {
2810 if ( it->second.empty() )
2811 return NULL;
2812
2813 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2814 if ( conv->IsOk() )
2815 return conv;
2816
2817 delete conv;
2818 }
2819
2820 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2821 // CS : in case this does not return valid names (eg for MacRoman)
2822 // encoding got a 'failure' entry in the cache all the same,
2823 // although it just has to be created using a different method, so
2824 // only store failed iconv creation attempts (or perhaps we
2825 // shoulnd't do this at all ?)
2826 if ( names[0] != NULL )
2827 {
2828 for ( ; *names; ++names )
2829 {
2830 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2831 // will need changes that will obsolete this
2832 wxString name(*names);
2833 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2834 if ( conv->IsOk() )
2835 {
2836 gs_nameCache[encoding] = *names;
2837 return conv;
2838 }
2839
2840 delete conv;
2841 }
2842
2843 gs_nameCache[encoding] = _T(""); // cache the failure
2844 }
2845 }
2846 #endif // wxUSE_FONTMAP
2847 }
2848 #endif // HAVE_ICONV
2849
2850 #ifdef wxHAVE_WIN32_MB2WC
2851 {
2852 #if wxUSE_FONTMAP
2853 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2854 : new wxMBConv_win32(m_encoding);
2855 if ( conv->IsOk() )
2856 return conv;
2857
2858 delete conv;
2859 #else
2860 return NULL;
2861 #endif
2862 }
2863 #endif // wxHAVE_WIN32_MB2WC
2864
2865 #ifdef __DARWIN__
2866 {
2867 // leave UTF16 and UTF32 to the built-ins of wx
2868 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2869 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2870 {
2871 #if wxUSE_FONTMAP
2872 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2873 : new wxMBConv_cf(m_encoding);
2874 #else
2875 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2876 #endif
2877
2878 if ( conv->IsOk() )
2879 return conv;
2880
2881 delete conv;
2882 }
2883 }
2884 #endif // __DARWIN__
2885
2886 // step (2)
2887 wxFontEncoding enc = m_encoding;
2888 #if wxUSE_FONTMAP
2889 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2890 {
2891 // use "false" to suppress interactive dialogs -- we can be called from
2892 // anywhere and popping up a dialog from here is the last thing we want to
2893 // do
2894 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2895 }
2896 #endif // wxUSE_FONTMAP
2897
2898 switch ( enc )
2899 {
2900 case wxFONTENCODING_UTF7:
2901 return new wxMBConvUTF7;
2902
2903 case wxFONTENCODING_UTF8:
2904 return new wxMBConvUTF8;
2905
2906 case wxFONTENCODING_UTF16BE:
2907 return new wxMBConvUTF16BE;
2908
2909 case wxFONTENCODING_UTF16LE:
2910 return new wxMBConvUTF16LE;
2911
2912 case wxFONTENCODING_UTF32BE:
2913 return new wxMBConvUTF32BE;
2914
2915 case wxFONTENCODING_UTF32LE:
2916 return new wxMBConvUTF32LE;
2917
2918 default:
2919 // nothing to do but put here to suppress gcc warnings
2920 break;
2921 }
2922
2923 // step (3)
2924 #if wxUSE_FONTMAP
2925 {
2926 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2927 : new wxMBConv_wxwin(m_encoding);
2928 if ( conv->IsOk() )
2929 return conv;
2930
2931 delete conv;
2932 }
2933 #endif // wxUSE_FONTMAP
2934
2935 // NB: This is a hack to prevent deadlock. What could otherwise happen
2936 // in Unicode build: wxConvLocal creation ends up being here
2937 // because of some failure and logs the error. But wxLog will try to
2938 // attach a timestamp, for which it will need wxConvLocal (to convert
2939 // time to char* and then wchar_t*), but that fails, tries to log the
2940 // error, but wxLog has an (already locked) critical section that
2941 // guards the static buffer.
2942 static bool alreadyLoggingError = false;
2943 if (!alreadyLoggingError)
2944 {
2945 alreadyLoggingError = true;
2946 wxLogError(_("Cannot convert from the charset '%s'!"),
2947 m_name ? m_name
2948 :
2949 #if wxUSE_FONTMAP
2950 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2951 #else // !wxUSE_FONTMAP
2952 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2953 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2954 );
2955
2956 alreadyLoggingError = false;
2957 }
2958
2959 return NULL;
2960 }
2961
2962 void wxCSConv::CreateConvIfNeeded() const
2963 {
2964 if ( m_deferred )
2965 {
2966 wxCSConv *self = (wxCSConv *)this; // const_cast
2967
2968 // if we don't have neither the name nor the encoding, use the default
2969 // encoding for this system
2970 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2971 {
2972 #if wxUSE_INTL
2973 self->m_encoding = wxLocale::GetSystemEncoding();
2974 #else
2975 // fallback to some reasonable default:
2976 self->m_encoding = wxFONTENCODING_ISO8859_1;
2977 #endif // wxUSE_INTL
2978 }
2979
2980 self->m_convReal = DoCreate();
2981 self->m_deferred = false;
2982 }
2983 }
2984
2985 bool wxCSConv::IsOk() const
2986 {
2987 CreateConvIfNeeded();
2988
2989 // special case: no convReal created for wxFONTENCODING_ISO8859_1
2990 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2991 return true; // always ok as we do it ourselves
2992
2993 // m_convReal->IsOk() is called at its own creation, so we know it must
2994 // be ok if m_convReal is non-NULL
2995 return m_convReal != NULL;
2996 }
2997
2998 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
2999 const char *src, size_t srcLen) const
3000 {
3001 CreateConvIfNeeded();
3002
3003 if (m_convReal)
3004 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3005
3006 // latin-1 (direct)
3007 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3008 }
3009
3010 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3011 const wchar_t *src, size_t srcLen) const
3012 {
3013 CreateConvIfNeeded();
3014
3015 if (m_convReal)
3016 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3017
3018 // latin-1 (direct)
3019 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3020 }
3021
3022 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3023 {
3024 CreateConvIfNeeded();
3025
3026 if (m_convReal)
3027 return m_convReal->MB2WC(buf, psz, n);
3028
3029 // latin-1 (direct)
3030 size_t len = strlen(psz);
3031
3032 if (buf)
3033 {
3034 for (size_t c = 0; c <= len; c++)
3035 buf[c] = (unsigned char)(psz[c]);
3036 }
3037
3038 return len;
3039 }
3040
3041 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3042 {
3043 CreateConvIfNeeded();
3044
3045 if (m_convReal)
3046 return m_convReal->WC2MB(buf, psz, n);
3047
3048 // latin-1 (direct)
3049 const size_t len = wxWcslen(psz);
3050 if (buf)
3051 {
3052 for (size_t c = 0; c <= len; c++)
3053 {
3054 if (psz[c] > 0xFF)
3055 return wxCONV_FAILED;
3056
3057 buf[c] = (char)psz[c];
3058 }
3059 }
3060 else
3061 {
3062 for (size_t c = 0; c <= len; c++)
3063 {
3064 if (psz[c] > 0xFF)
3065 return wxCONV_FAILED;
3066 }
3067 }
3068
3069 return len;
3070 }
3071
3072 size_t wxCSConv::GetMBNulLen() const
3073 {
3074 CreateConvIfNeeded();
3075
3076 if ( m_convReal )
3077 {
3078 return m_convReal->GetMBNulLen();
3079 }
3080
3081 // otherwise, we are ISO-8859-1
3082 return 1;
3083 }
3084
#if wxUSE_UNICODE_UTF8
// Return true if the real conversion object says that it is UTF-8.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    // without m_convReal we are ISO-8859-1 and so definitely not UTF-8
    return m_convReal ? m_convReal->IsUTF8() : false;
}
#endif
3099
3100
3101 #if wxUSE_UNICODE
3102
3103 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3104 {
3105 if ( !s )
3106 return wxWCharBuffer();
3107
3108 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3109 if ( !wbuf )
3110 wbuf = wxMBConvUTF8().cMB2WX(s);
3111 if ( !wbuf )
3112 wbuf = wxConvISO8859_1.cMB2WX(s);
3113
3114 return wbuf;
3115 }
3116
3117 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3118 {
3119 if ( !ws )
3120 return wxCharBuffer();
3121
3122 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3123 if ( !buf )
3124 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3125
3126 return buf;
3127 }
3128
3129 #endif // wxUSE_UNICODE
3130
3131 // ----------------------------------------------------------------------------
3132 // globals
3133 // ----------------------------------------------------------------------------
3134
3135 // NB: The reason why we create converted objects in this convoluted way,
3136 // using a factory function instead of global variable, is that they
3137 // may be used at static initialization time (some of them are used by
3138 // wxString ctors and there may be a global wxString object). In other
3139 // words, possibly _before_ the converter global object would be
3140 // initialized.
3141
3142 #undef wxConvLibc
3143 #undef wxConvUTF8
3144 #undef wxConvUTF7
3145 #undef wxConvLocal
3146 #undef wxConvISO8859_1
3147
3148 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3149 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3150 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3151 { \
3152 static impl_klass name##Obj ctor_args; \
3153 return &name##Obj; \
3154 } \
3155 /* this ensures that all global converter objects are created */ \
3156 /* by the time static initialization is done, i.e. before any */ \
3157 /* thread is launched: */ \
3158 static klass* gs_##name##instance = wxGet_##name##Ptr()
3159
3160 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3161 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3162
3163 #ifdef __WINDOWS__
3164 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3165 #else
3166 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3167 #endif
3168
3169 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3170 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3171
3172 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3173 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3174
3175 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3176 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3177
3178 #ifdef __DARWIN__
3179 // The xnu kernel always communicates file paths in decomposed UTF-8.
3180 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3181 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3182 #endif
3183
3184 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3185 #ifdef __DARWIN__
3186 &wxConvMacUTF8DObj;
3187 #else // !__DARWIN__
3188 wxGet_wxConvLibcPtr();
3189 #endif // __DARWIN__/!__DARWIN__
3190
3191 #else // !wxUSE_WCHAR_T
3192
3193 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3194 // stand-ins in absence of wchar_t
3195 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3196 wxConvISO8859_1,
3197 wxConvLocal,
3198 wxConvUTF8;
3199
3200 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T