]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
don't use 8 bit characters in sources, this results in level 1 warning with VC8 ...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef __SALFORDC__
48 #include <clib.h>
49 #endif
50
51 #ifdef HAVE_ICONV
52 #include <iconv.h>
53 #include "wx/thread.h"
54 #endif
55
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
58
59 #ifdef __DARWIN__
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
62
63
64 #define TRACE_STRCONV _T("strconv")
65
66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
67 // be 4 bytes
68 #if SIZEOF_WCHAR_T == 2
69 #define WC_UTF16
70 #endif
71
72
73 // ============================================================================
74 // implementation
75 // ============================================================================
76
// helper function of cMB2WC(): returns true if at least one of the n bytes
// starting at p is not NUL (and hence false if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
85
86 // ----------------------------------------------------------------------------
87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
88 // ----------------------------------------------------------------------------
89
90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
91 {
92 if (input <= 0xffff)
93 {
94 if (output)
95 *output = (wxUint16) input;
96
97 return 1;
98 }
99 else if (input >= 0x110000)
100 {
101 return wxCONV_FAILED;
102 }
103 else
104 {
105 if (output)
106 {
107 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
108 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
109 }
110
111 return 2;
112 }
113 }
114
115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
116 {
117 if ((*input < 0xd800) || (*input > 0xdfff))
118 {
119 output = *input;
120 return 1;
121 }
122 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
123 {
124 output = *input;
125 return wxCONV_FAILED;
126 }
127 else
128 {
129 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
130 return 2;
131 }
132 }
133
134 #ifdef WC_UTF16
135 typedef wchar_t wxDecodeSurrogate_t;
136 #else // !WC_UTF16
137 typedef wxUint16 wxDecodeSurrogate_t;
138 #endif // WC_UTF16/!WC_UTF16
139
140 // returns the next UTF-32 character from the wchar_t buffer and advances the
141 // pointer to the character after this one
142 //
143 // if an invalid character is found, *pSrc is set to NULL, the caller must
144 // check for this
145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
146 {
147 wxUint32 out;
148 const size_t
149 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
150 if ( n == wxCONV_FAILED )
151 *pSrc = NULL;
152 else
153 *pSrc += n;
154
155 return out;
156 }
157
158 // ----------------------------------------------------------------------------
159 // wxMBConv
160 // ----------------------------------------------------------------------------
161
// Default implementation of ToWChar() in terms of the old MB2WC().
//
// Converts srcLen bytes of src (or, if srcLen is wxNO_LEN, the string up to
// its terminator) to wide characters. If dst is NULL nothing is stored and
// only the length is computed, otherwise at most dstLen wide characters are
// stored.
//
// Returns the number of wide characters [which would be] written, counting
// one L'\0' per converted chunk, or wxCONV_FAILED if MB2WC() or GetMBNulLen()
// fails or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);

            // append nulLen NUL bytes after the copied data
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // notice that the L'\0' of this empty chunk has been counted in
            // dstWritten above but is not stored in dst
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
268
269 size_t
270 wxMBConv::FromWChar(char *dst, size_t dstLen,
271 const wchar_t *src, size_t srcLen) const
272 {
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten = 0;
275
276 // make a copy of the input string unless it is already properly
277 // NUL-terminated
278 //
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp;
282 if ( srcLen == wxNO_LEN )
283 {
284 srcLen = wxWcslen(src) + 1;
285 }
286 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
287 {
288 // make a copy in order to properly NUL-terminate the string
289 bufTmp = wxWCharBuffer(srcLen);
290 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
291 src = bufTmp;
292 }
293
294 const size_t lenNul = GetMBNulLen();
295 for ( const wchar_t * const srcEnd = src + srcLen;
296 src < srcEnd;
297 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
298 {
299 // try to convert the current chunk
300 size_t lenChunk = WC2MB(NULL, src, 0);
301
302 if ( lenChunk == wxCONV_FAILED )
303 return wxCONV_FAILED;
304
305 lenChunk += lenNul;
306 dstWritten += lenChunk;
307
308 if ( dst )
309 {
310 if ( dstWritten > dstLen )
311 return wxCONV_FAILED;
312
313 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 dst += lenChunk;
317 }
318 }
319
320 return dstWritten;
321 }
322
323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
324 {
325 size_t rc = ToWChar(outBuff, outLen, inBuff);
326 if ( rc != wxCONV_FAILED )
327 {
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
330 rc--;
331 }
332
333 return rc;
334 }
335
336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
337 {
338 size_t rc = FromWChar(outBuff, outLen, inBuff);
339 if ( rc != wxCONV_FAILED )
340 {
341 rc -= GetMBNulLen();
342 }
343
344 return rc;
345 }
346
// Out-of-line destructor, intentionally empty.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
351
352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
353 {
354 if ( psz )
355 {
356 // calculate the length of the buffer needed first
357 const size_t nLen = ToWChar(NULL, 0, psz);
358 if ( nLen != wxCONV_FAILED )
359 {
360 // now do the actual conversion
361 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
362
363 // +1 for the trailing NULL
364 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
365 return buf;
366 }
367 }
368
369 return wxWCharBuffer();
370 }
371
372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
373 {
374 if ( pwz )
375 {
376 const size_t nLen = FromWChar(NULL, 0, pwz);
377 if ( nLen != wxCONV_FAILED )
378 {
379 wxCharBuffer buf(nLen - 1);
380 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
381 return buf;
382 }
383 }
384
385 return wxCharBuffer();
386 }
387
388 const wxWCharBuffer
389 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
390 {
391 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
392 if ( dstLen != wxCONV_FAILED )
393 {
394 // notice that we allocate space for dstLen+1 wide characters here
395 // because we want the buffer to always be NUL-terminated, even if the
396 // input isn't (as otherwise the caller has no way to know its length)
397 wxWCharBuffer wbuf(dstLen);
398 wbuf.data()[dstLen - 1] = L'\0';
399 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
400 {
401 if ( outLen )
402 {
403 *outLen = dstLen;
404 if ( wbuf[dstLen - 1] == L'\0' )
405 (*outLen)--;
406 }
407
408 return wbuf;
409 }
410 }
411
412 if ( outLen )
413 *outLen = 0;
414
415 return wxWCharBuffer();
416 }
417
418 const wxCharBuffer
419 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
420 {
421 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
422 if ( dstLen != wxCONV_FAILED )
423 {
424 const size_t nulLen = GetMBNulLen();
425
426 // as above, ensure that the buffer is always NUL-terminated, even if
427 // the input is not
428 wxCharBuffer buf(dstLen + nulLen - 1);
429 memset(buf.data() + dstLen, 0, nulLen);
430 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
431 {
432 if ( outLen )
433 {
434 *outLen = dstLen;
435
436 if ( dstLen >= nulLen &&
437 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
438 {
439 // in this case the output is NUL-terminated and we're not
440 // supposed to count NUL
441 *outLen -= nulLen;
442 }
443 }
444
445 return buf;
446 }
447 }
448
449 if ( outLen )
450 *outLen = 0;
451
452 return wxCharBuffer();
453 }
454
455 // ----------------------------------------------------------------------------
456 // wxMBConvLibc
457 // ----------------------------------------------------------------------------
458
459 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
460 {
461 return wxMB2WC(buf, psz, n);
462 }
463
464 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
465 {
466 return wxWC2MB(buf, psz, n);
467 }
468
469 // ----------------------------------------------------------------------------
470 // wxConvBrokenFileNames
471 // ----------------------------------------------------------------------------
472
473 #ifdef __UNIX__
474
475 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
476 {
477 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
478 wxStricmp(charset, _T("UTF8")) == 0 )
479 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
480 else
481 m_conv = new wxCSConv(charset);
482 }
483
484 #endif // __UNIX__
485
486 // ----------------------------------------------------------------------------
487 // UTF-7
488 // ----------------------------------------------------------------------------
489
490 // Implementation (C) 2004 Fredrik Roubert
491
//
// BASE64 decoding table: maps a byte to its 6 bit BASE64 value, 0xff marks
// the bytes which are not BASE64 digits
//
static const unsigned char utf7unb64[] =
{
    // 0x00..0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10..0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50..0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70..0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: no BASE64 digits here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
530
531 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
532 {
533 size_t len = 0;
534
535 while ( *psz && (!buf || (len < n)) )
536 {
537 unsigned char cc = *psz++;
538 if (cc != '+')
539 {
540 // plain ASCII char
541 if (buf)
542 *buf++ = cc;
543 len++;
544 }
545 else if (*psz == '-')
546 {
547 // encoded plus sign
548 if (buf)
549 *buf++ = cc;
550 len++;
551 psz++;
552 }
553 else // start of BASE64 encoded string
554 {
555 bool lsb, ok;
556 unsigned int d, l;
557 for ( ok = lsb = false, d = 0, l = 0;
558 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
559 psz++ )
560 {
561 d <<= 6;
562 d += cc;
563 for (l += 6; l >= 8; lsb = !lsb)
564 {
565 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
566 if (lsb)
567 {
568 if (buf)
569 *buf++ |= c;
570 len ++;
571 }
572 else
573 {
574 if (buf)
575 *buf = (wchar_t)(c << 8);
576 }
577
578 ok = true;
579 }
580 }
581
582 if ( !ok )
583 {
584 // in valid UTF7 we should have valid characters after '+'
585 return wxCONV_FAILED;
586 }
587
588 if (*psz == '-')
589 psz++;
590 }
591 }
592
593 if ( buf && (len < n) )
594 *buf = '\0';
595
596 return len;
597 }
598
//
// BASE64 encoding table: maps a 6 bit value to the BASE64 digit encoding it
//
static const unsigned char utf7enb64[] =
{
    // 0..25: upper case letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26..51: lower case letters
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52..61: digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62 and 63
    '+', '/'
};
613
//
// UTF-7 encoding table, indexed by the (7 bit) character code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// wxMBConvUTF7::WC2MB() only outputs the characters with value 0 directly
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 0x00..0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0x10..0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 0x20..0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 0x30..0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40..0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 0x50..0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60..0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 0x70..0x7f
};
633
// Encodes the NUL-terminated wide string psz in UTF-7.
//
// If buf is NULL, nothing is stored and only the length is computed,
// otherwise the output is stored in buf and a terminating NUL is added only
// if fewer than n bytes were produced.
//
// Returns the number of bytes [which would be] stored, not counting the
// terminating NUL, or (if wchar_t is not UTF-16) wxCONV_FAILED if a character
// above 0xffff is encountered at the start of a run.
//
// NOTE(review): n is only checked in the outer loop condition, a single
// iteration may output several bytes without checking it again -- confirm
// that callers always pass a buffer of the size computed with NULL buf.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;

        // NOTE(review): if wchar_t is signed and cc is negative, cc < 0x80
        // holds and the table would be indexed with a negative value --
        // TODO confirm whether such input can occur
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // all the other characters are output as "+<BASE64 data>-", with
            // '+' itself being encoded as "+-"
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to output and l is the number of
                // bits in it not output yet
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // feed the high byte and then the low byte of cc
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // output as many complete 6 bit groups as we have
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // the run ends at the end of the string or at the first
                    // character which can be output directly; psz is not
                    // advanced past this character so that the outer loop
                    // handles it
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                // output the remaining bits, if any, as the final digit
                if (l != 0)
                {
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            // explicitly terminate the encoded run
            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
707
708 // ----------------------------------------------------------------------------
709 // UTF-8
710 // ----------------------------------------------------------------------------
711
// utf8_max[i] is the greatest value which can be encoded using i+1 bytes:
// wxMBConvUTF8::FromWChar() outputs cnt+1 bytes for the first cnt such that
// the character is not greater than utf8_max[cnt], and wxMBConvUTF8::ToWChar()
// uses the table to reject the values encoded using more bytes than necessary
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: byte b is mapped to
// wxUnicodePUA + b, wxUnicodePUAEnd is one past the end of the range used
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
719
// this table gives the length of the UTF-8 encoding from its first character,
// 0 is used for the bytes which can't start a valid sequence:
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid (continuation bytes can't start a sequence and C0
    // and C1 could only start sequences longer than necessary):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
754
// Decodes UTF-8 input to wide characters.
//
// Converts srcLen bytes of src or, if srcLen is wxNO_LEN, the NUL-terminated
// string including its terminator. Nothing is stored if dstLen is 0 (dst is
// not used at all then), otherwise at most dstLen wide characters are stored
// in dst.
//
// Returns the number of wide characters [which would be] written or
// wxCONV_FAILED if the input is invalid or truncated or if the output buffer
// is too small.
size_t
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
                            const char *src, size_t srcLen) const
{
    wchar_t *out = dstLen ? dst : NULL;
    size_t written = 0;

    // from now on srcLen is always an explicit length: for NUL-terminated
    // input it includes the terminator, which is thus converted by the loop
    // below just as any other character
    if ( srcLen == wxNO_LEN )
        srcLen = strlen(src) + 1;

    for ( const char *p = src; ; p++ )
    {
        if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            //
            // (as srcLen was replaced with the real length above, the test
            // below is never true here)
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;

                    *out = L'\0';
                }

                written++;
            }

            return written;
        }

        // fail if there is no more space in the output buffer
        //
        // NOTE(review): dstLen is only decremented once per decoded code
        // point, even if two wchar_t are stored for it in WC_UTF16 case
        // below -- confirm that this can't overflow dst
        if ( out && !dstLen-- )
            break;

        wxUint32 code;
        unsigned char c = *p;

        if ( c < 0x80 )
        {
            if ( srcLen == 0 ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen--;

            code = c;
        }
        else
        {
            // total length of the sequence starting with this byte, 0 if it
            // can't start a sequence
            unsigned len = tableUtf8Lengths[c];
            if ( !len )
                break;

            if ( srcLen < len ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen -= len;

            // Char. number range   |        UTF-8 octet sequence
            //    (hexadecimal)     |              (binary)
            //  ----------------------+----------------------------------------
            //  0000 0000 - 0000 007F | 0xxxxxxx
            //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
            //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
            //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            //
            //  Code point value is stored in bits marked with 'x',
            //  lowest-order bit of the value on the right side in the diagram
            //  above.                                         (from RFC 3629)

            // mask to extract lead byte's value ('x' bits above), by sequence
            // length:
            static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

            // mask and value of lead byte's most significant bits, by length:
            static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
            static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };

            len--; // it's more convenient to work with 0-based length here

            // extract the lead byte's value bits:
            if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
                break;

            code = c & leadValueMask[len];

            // all remaining bytes, if any, are handled in the same way
            // regardless of sequence's length:
            for ( ; len; --len )
            {
                c = *++p;
                if ( (c & 0xC0) != 0x80 )
                    return wxCONV_FAILED;

                code <<= 6;
                code |= c & 0x3F;
            }
        }

#ifdef WC_UTF16
        // cast is ok because wchar_t == wxUint16 if WC_UTF16
        //
        // encode_utf16() stores the unit(s) itself if out is not NULL, here
        // we only account for the extra unit of a surrogate pair
        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
        {
            if ( out )
                out++;
            written++;
        }
#else // !WC_UTF16
        if ( out )
            *out = code;
#endif // WC_UTF16/!WC_UTF16

        if ( out )
            out++;

        written++;
    }

    // we only get here if an error occurred (all the "break" statements above)
    return wxCONV_FAILED;
}
877
878 size_t
879 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
880 const wchar_t *src, size_t srcLen) const
881 {
882 char *out = dstLen ? dst : NULL;
883 size_t written = 0;
884
885 for ( const wchar_t *wp = src; ; wp++ )
886 {
887 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
888 {
889 // all done successfully, just add the trailing NULL if we are not
890 // using explicit length
891 if ( srcLen == wxNO_LEN )
892 {
893 if ( out )
894 {
895 if ( !dstLen )
896 break;
897
898 *out = '\0';
899 }
900
901 written++;
902 }
903
904 return written;
905 }
906
907
908 wxUint32 code;
909 #ifdef WC_UTF16
910 // cast is ok for WC_UTF16
911 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
912 {
913 // skip the next char too as we decoded a surrogate
914 wp++;
915 }
916 #else // wchar_t is UTF-32
917 code = *wp & 0x7fffffff;
918 #endif
919
920 unsigned len;
921 if ( code <= 0x7F )
922 {
923 len = 1;
924 if ( out )
925 {
926 if ( dstLen < len )
927 break;
928
929 out[0] = (char)code;
930 }
931 }
932 else if ( code <= 0x07FF )
933 {
934 len = 2;
935 if ( out )
936 {
937 if ( dstLen < len )
938 break;
939
940 // NB: this line takes 6 least significant bits, encodes them as
941 // 10xxxxxx and discards them so that the next byte can be encoded:
942 out[1] = 0x80 | (code & 0x3F); code >>= 6;
943 out[0] = 0xC0 | code;
944 }
945 }
946 else if ( code < 0xFFFF )
947 {
948 len = 3;
949 if ( out )
950 {
951 if ( dstLen < len )
952 break;
953
954 out[2] = 0x80 | (code & 0x3F); code >>= 6;
955 out[1] = 0x80 | (code & 0x3F); code >>= 6;
956 out[0] = 0xE0 | code;
957 }
958 }
959 else if ( code <= 0x10FFFF )
960 {
961 len = 4;
962 if ( out )
963 {
964 if ( dstLen < len )
965 break;
966
967 out[3] = 0x80 | (code & 0x3F); code >>= 6;
968 out[2] = 0x80 | (code & 0x3F); code >>= 6;
969 out[1] = 0x80 | (code & 0x3F); code >>= 6;
970 out[0] = 0xF0 | code;
971 }
972 }
973 else
974 {
975 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
976 break;
977 }
978
979 if ( out )
980 {
981 out += len;
982 dstLen -= len;
983 }
984
985 written += len;
986 }
987
988 // we only get here if an error occurs during decoding
989 return wxCONV_FAILED;
990 }
991
// Decodes UTF-8 input to wide characters, handling the invalid sequences
// according to m_options: the strict base class version is used for
// MAP_INVALID_UTF8_NOT, otherwise each byte of an invalid sequence is either
// mapped to the character wxUnicodePUA + byte (MAP_INVALID_UTF8_TO_PUA) or
// output as a backslash followed by 3 octal digits (MAP_INVALID_UTF8_TO_OCTAL).
//
// If buf is NULL, nothing is stored and only the length is computed,
// otherwise n is the size of buf in wide characters.
//
// Returns the number of wide characters [which would be] written plus 1, or
// wxCONV_FAILED.
//
// NOTE(review): when an explicit srcLen is used, the "srcLen--" in the loop
// condition makes it wrap around to wxNO_LEN once it reaches 0, so the
// trailing NUL is stored after the loop in this case too; also, srcLen is
// only decremented once per sequence and the continuation bytes are read
// without checking it -- confirm whether explicit lengths are handled as
// intended here.
size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
                             const char *psz, size_t srcLen) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);

    size_t len = 0;

    while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
    {
        // remember the start of the sequence to be able to output its bytes
        // one by one if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // cnt is now the number of the continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                // a value which could have been encoded using fewer bytes
                // is not valid
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // output all the bytes of the invalid sequence consumed so
                // far, i.e. those in [opsz, psz), individually
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only stored if there is enough space
                        // for all its 4 characters, but it is always counted
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (srcLen == wxNO_LEN && buf && (len < n))
        *buf = 0;

    return len + 1;
}
1139
// returns true if wch is one of the octal digits L'0'..L'7'
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1144
// Encodes wide characters as UTF-8, undoing the mappings done by ToWChar()
// according to m_options: the strict base class version is used for
// MAP_INVALID_UTF8_NOT, otherwise the characters in the range
// [wxUnicodePUA, wxUnicodePUAEnd) (MAP_INVALID_UTF8_TO_PUA) or the octal
// escapes (MAP_INVALID_UTF8_TO_OCTAL) are converted back to single bytes.
//
// If buf is NULL, nothing is stored and only the length is computed,
// otherwise n is the size of buf in bytes.
//
// Returns the number of bytes [which would be] written plus 1.
//
// NOTE(review): n is only checked in the loop condition, so a multibyte
// sequence is stored without checking that there is enough space for all of
// it; also, when an explicit srcLen is used, the "srcLen--" in the loop
// condition makes it wrap around to wxNO_LEN once it reaches 0 and it is not
// adjusted for the extra characters consumed by surrogates or escapes --
// confirm whether explicit lengths are handled as intended here.
size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
                               const wchar_t *psz, size_t srcLen) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);

    size_t len = 0;

    while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        //
        // in case of failure cc is set to the first unit and we skip just it
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // an escaped backslash: output a single one
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // an octal escape: output the byte it represents
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed for this value
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the lead byte has its cnt+1 most significant bits set,
                    // followed by the most significant bits of the value
                    //
                    // NOTE(review): this relies on the right shift of a
                    // negative value being an arithmetic one
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // each continuation byte stores the next 6 bits
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (srcLen == wxNO_LEN && buf && (len < n))
        *buf = 0;

    return len + 1;
}
1226
1227 // ============================================================================
1228 // UTF-16
1229 // ============================================================================
1230
1231 #ifdef WORDS_BIGENDIAN
1232 #define wxMBConvUTF16straight wxMBConvUTF16BE
1233 #define wxMBConvUTF16swap wxMBConvUTF16LE
1234 #else
1235 #define wxMBConvUTF16swap wxMBConvUTF16BE
1236 #define wxMBConvUTF16straight wxMBConvUTF16LE
1237 #endif
1238
1239 /* static */
1240 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1241 {
1242 if ( srcLen == wxNO_LEN )
1243 {
1244 // count the number of bytes in input, including the trailing NULs
1245 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1246 for ( srcLen = 1; *inBuff++; srcLen++ )
1247 ;
1248
1249 srcLen *= BYTES_PER_CHAR;
1250 }
1251 else // we already have the length
1252 {
1253 // we can only convert an entire number of UTF-16 characters
1254 if ( srcLen % BYTES_PER_CHAR )
1255 return wxCONV_FAILED;
1256 }
1257
1258 return srcLen;
1259 }
1260
1261 // case when in-memory representation is UTF-16 too
1262 #ifdef WC_UTF16
1263
1264 // ----------------------------------------------------------------------------
1265 // conversions without endianness change
1266 // ----------------------------------------------------------------------------
1267
1268 size_t
1269 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1270 const char *src, size_t srcLen) const
1271 {
1272 // set up the scene for using memcpy() (which is presumably more efficient
1273 // than copying the bytes one by one)
1274 srcLen = GetLength(src, srcLen);
1275 if ( srcLen == wxNO_LEN )
1276 return wxCONV_FAILED;
1277
1278 const size_t inLen = srcLen / BYTES_PER_CHAR;
1279 if ( dst )
1280 {
1281 if ( dstLen < inLen )
1282 return wxCONV_FAILED;
1283
1284 memcpy(dst, src, srcLen);
1285 }
1286
1287 return inLen;
1288 }
1289
1290 size_t
1291 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1292 const wchar_t *src, size_t srcLen) const
1293 {
1294 if ( srcLen == wxNO_LEN )
1295 srcLen = wxWcslen(src) + 1;
1296
1297 srcLen *= BYTES_PER_CHAR;
1298
1299 if ( dst )
1300 {
1301 if ( dstLen < srcLen )
1302 return wxCONV_FAILED;
1303
1304 memcpy(dst, src, srcLen);
1305 }
1306
1307 return srcLen;
1308 }
1309
1310 // ----------------------------------------------------------------------------
1311 // endian-reversing conversions
1312 // ----------------------------------------------------------------------------
1313
1314 size_t
1315 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1316 const char *src, size_t srcLen) const
1317 {
1318 srcLen = GetLength(src, srcLen);
1319 if ( srcLen == wxNO_LEN )
1320 return wxCONV_FAILED;
1321
1322 srcLen /= BYTES_PER_CHAR;
1323
1324 if ( dst )
1325 {
1326 if ( dstLen < srcLen )
1327 return wxCONV_FAILED;
1328
1329 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1330 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1331 {
1332 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1333 }
1334 }
1335
1336 return srcLen;
1337 }
1338
1339 size_t
1340 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1341 const wchar_t *src, size_t srcLen) const
1342 {
1343 if ( srcLen == wxNO_LEN )
1344 srcLen = wxWcslen(src) + 1;
1345
1346 srcLen *= BYTES_PER_CHAR;
1347
1348 if ( dst )
1349 {
1350 if ( dstLen < srcLen )
1351 return wxCONV_FAILED;
1352
1353 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1354 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1355 {
1356 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1357 }
1358 }
1359
1360 return srcLen;
1361 }
1362
1363 #else // !WC_UTF16: wchar_t is UTF-32
1364
1365 // ----------------------------------------------------------------------------
1366 // conversions without endianness change
1367 // ----------------------------------------------------------------------------
1368
1369 size_t
1370 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1371 const char *src, size_t srcLen) const
1372 {
1373 srcLen = GetLength(src, srcLen);
1374 if ( srcLen == wxNO_LEN )
1375 return wxCONV_FAILED;
1376
1377 const size_t inLen = srcLen / BYTES_PER_CHAR;
1378 if ( !dst )
1379 {
1380 // optimization: return maximal space which could be needed for this
1381 // string even if the real size could be smaller if the buffer contains
1382 // any surrogates
1383 return inLen;
1384 }
1385
1386 size_t outLen = 0;
1387 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1388 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1389 {
1390 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1391 if ( !inBuff )
1392 return wxCONV_FAILED;
1393
1394 if ( ++outLen > dstLen )
1395 return wxCONV_FAILED;
1396
1397 *dst++ = ch;
1398 }
1399
1400
1401 return outLen;
1402 }
1403
1404 size_t
1405 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1406 const wchar_t *src, size_t srcLen) const
1407 {
1408 if ( srcLen == wxNO_LEN )
1409 srcLen = wxWcslen(src) + 1;
1410
1411 size_t outLen = 0;
1412 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1413 for ( size_t n = 0; n < srcLen; n++ )
1414 {
1415 wxUint16 cc[2];
1416 const size_t numChars = encode_utf16(*src++, cc);
1417 if ( numChars == wxCONV_FAILED )
1418 return wxCONV_FAILED;
1419
1420 outLen += numChars * BYTES_PER_CHAR;
1421 if ( outBuff )
1422 {
1423 if ( outLen > dstLen )
1424 return wxCONV_FAILED;
1425
1426 *outBuff++ = cc[0];
1427 if ( numChars == 2 )
1428 {
1429 // second character of a surrogate
1430 *outBuff++ = cc[1];
1431 }
1432 }
1433 }
1434
1435 return outLen;
1436 }
1437
1438 // ----------------------------------------------------------------------------
1439 // endian-reversing conversions
1440 // ----------------------------------------------------------------------------
1441
1442 size_t
1443 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1444 const char *src, size_t srcLen) const
1445 {
1446 srcLen = GetLength(src, srcLen);
1447 if ( srcLen == wxNO_LEN )
1448 return wxCONV_FAILED;
1449
1450 const size_t inLen = srcLen / BYTES_PER_CHAR;
1451 if ( !dst )
1452 {
1453 // optimization: return maximal space which could be needed for this
1454 // string even if the real size could be smaller if the buffer contains
1455 // any surrogates
1456 return inLen;
1457 }
1458
1459 size_t outLen = 0;
1460 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1461 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1462 {
1463 wxUint32 ch;
1464 wxUint16 tmp[2];
1465
1466 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1467 inBuff++;
1468 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1469
1470 const size_t numChars = decode_utf16(tmp, ch);
1471 if ( numChars == wxCONV_FAILED )
1472 return wxCONV_FAILED;
1473
1474 if ( numChars == 2 )
1475 inBuff++;
1476
1477 if ( ++outLen > dstLen )
1478 return wxCONV_FAILED;
1479
1480 *dst++ = ch;
1481 }
1482
1483
1484 return outLen;
1485 }
1486
1487 size_t
1488 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1489 const wchar_t *src, size_t srcLen) const
1490 {
1491 if ( srcLen == wxNO_LEN )
1492 srcLen = wxWcslen(src) + 1;
1493
1494 size_t outLen = 0;
1495 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1496 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1497 {
1498 wxUint16 cc[2];
1499 const size_t numChars = encode_utf16(*src, cc);
1500 if ( numChars == wxCONV_FAILED )
1501 return wxCONV_FAILED;
1502
1503 outLen += numChars * BYTES_PER_CHAR;
1504 if ( outBuff )
1505 {
1506 if ( outLen > dstLen )
1507 return wxCONV_FAILED;
1508
1509 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1510 if ( numChars == 2 )
1511 {
1512 // second character of a surrogate
1513 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1514 }
1515 }
1516 }
1517
1518 return outLen;
1519 }
1520
1521 #endif // WC_UTF16/!WC_UTF16
1522
1523
1524 // ============================================================================
1525 // UTF-32
1526 // ============================================================================
1527
1528 #ifdef WORDS_BIGENDIAN
1529 #define wxMBConvUTF32straight wxMBConvUTF32BE
1530 #define wxMBConvUTF32swap wxMBConvUTF32LE
1531 #else
1532 #define wxMBConvUTF32swap wxMBConvUTF32BE
1533 #define wxMBConvUTF32straight wxMBConvUTF32LE
1534 #endif
1535
1536
1537 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1538 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1539
1540 /* static */
1541 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1542 {
1543 if ( srcLen == wxNO_LEN )
1544 {
1545 // count the number of bytes in input, including the trailing NULs
1546 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1547 for ( srcLen = 1; *inBuff++; srcLen++ )
1548 ;
1549
1550 srcLen *= BYTES_PER_CHAR;
1551 }
1552 else // we already have the length
1553 {
1554 // we can only convert an entire number of UTF-32 characters
1555 if ( srcLen % BYTES_PER_CHAR )
1556 return wxCONV_FAILED;
1557 }
1558
1559 return srcLen;
1560 }
1561
1562 // case when in-memory representation is UTF-16
1563 #ifdef WC_UTF16
1564
1565 // ----------------------------------------------------------------------------
1566 // conversions without endianness change
1567 // ----------------------------------------------------------------------------
1568
1569 size_t
1570 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1571 const char *src, size_t srcLen) const
1572 {
1573 srcLen = GetLength(src, srcLen);
1574 if ( srcLen == wxNO_LEN )
1575 return wxCONV_FAILED;
1576
1577 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1578 const size_t inLen = srcLen / BYTES_PER_CHAR;
1579 size_t outLen = 0;
1580 for ( size_t n = 0; n < inLen; n++ )
1581 {
1582 wxUint16 cc[2];
1583 const size_t numChars = encode_utf16(*inBuff++, cc);
1584 if ( numChars == wxCONV_FAILED )
1585 return wxCONV_FAILED;
1586
1587 outLen += numChars;
1588 if ( dst )
1589 {
1590 if ( outLen > dstLen )
1591 return wxCONV_FAILED;
1592
1593 *dst++ = cc[0];
1594 if ( numChars == 2 )
1595 {
1596 // second character of a surrogate
1597 *dst++ = cc[1];
1598 }
1599 }
1600 }
1601
1602 return outLen;
1603 }
1604
1605 size_t
1606 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1607 const wchar_t *src, size_t srcLen) const
1608 {
1609 if ( srcLen == wxNO_LEN )
1610 srcLen = wxWcslen(src) + 1;
1611
1612 if ( !dst )
1613 {
1614 // optimization: return maximal space which could be needed for this
1615 // string instead of the exact amount which could be less if there are
1616 // any surrogates in the input
1617 //
1618 // we consider that surrogates are rare enough to make it worthwhile to
1619 // avoid running the loop below at the cost of slightly extra memory
1620 // consumption
1621 return srcLen * BYTES_PER_CHAR;
1622 }
1623
1624 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1625 size_t outLen = 0;
1626 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1627 {
1628 const wxUint32 ch = wxDecodeSurrogate(&src);
1629 if ( !src )
1630 return wxCONV_FAILED;
1631
1632 outLen += BYTES_PER_CHAR;
1633
1634 if ( outLen > dstLen )
1635 return wxCONV_FAILED;
1636
1637 *outBuff++ = ch;
1638 }
1639
1640 return outLen;
1641 }
1642
1643 // ----------------------------------------------------------------------------
1644 // endian-reversing conversions
1645 // ----------------------------------------------------------------------------
1646
1647 size_t
1648 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1649 const char *src, size_t srcLen) const
1650 {
1651 srcLen = GetLength(src, srcLen);
1652 if ( srcLen == wxNO_LEN )
1653 return wxCONV_FAILED;
1654
1655 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1656 const size_t inLen = srcLen / BYTES_PER_CHAR;
1657 size_t outLen = 0;
1658 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1659 {
1660 wxUint16 cc[2];
1661 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1662 if ( numChars == wxCONV_FAILED )
1663 return wxCONV_FAILED;
1664
1665 outLen += numChars;
1666 if ( dst )
1667 {
1668 if ( outLen > dstLen )
1669 return wxCONV_FAILED;
1670
1671 *dst++ = cc[0];
1672 if ( numChars == 2 )
1673 {
1674 // second character of a surrogate
1675 *dst++ = cc[1];
1676 }
1677 }
1678 }
1679
1680 return outLen;
1681 }
1682
1683 size_t
1684 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1685 const wchar_t *src, size_t srcLen) const
1686 {
1687 if ( srcLen == wxNO_LEN )
1688 srcLen = wxWcslen(src) + 1;
1689
1690 if ( !dst )
1691 {
1692 // optimization: return maximal space which could be needed for this
1693 // string instead of the exact amount which could be less if there are
1694 // any surrogates in the input
1695 //
1696 // we consider that surrogates are rare enough to make it worthwhile to
1697 // avoid running the loop below at the cost of slightly extra memory
1698 // consumption
1699 return srcLen*BYTES_PER_CHAR;
1700 }
1701
1702 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1703 size_t outLen = 0;
1704 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1705 {
1706 const wxUint32 ch = wxDecodeSurrogate(&src);
1707 if ( !src )
1708 return wxCONV_FAILED;
1709
1710 outLen += BYTES_PER_CHAR;
1711
1712 if ( outLen > dstLen )
1713 return wxCONV_FAILED;
1714
1715 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1716 }
1717
1718 return outLen;
1719 }
1720
1721 #else // !WC_UTF16: wchar_t is UTF-32
1722
1723 // ----------------------------------------------------------------------------
1724 // conversions without endianness change
1725 // ----------------------------------------------------------------------------
1726
1727 size_t
1728 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1729 const char *src, size_t srcLen) const
1730 {
1731 // use memcpy() as it should be much faster than hand-written loop
1732 srcLen = GetLength(src, srcLen);
1733 if ( srcLen == wxNO_LEN )
1734 return wxCONV_FAILED;
1735
1736 const size_t inLen = srcLen/BYTES_PER_CHAR;
1737 if ( dst )
1738 {
1739 if ( dstLen < inLen )
1740 return wxCONV_FAILED;
1741
1742 memcpy(dst, src, srcLen);
1743 }
1744
1745 return inLen;
1746 }
1747
1748 size_t
1749 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1750 const wchar_t *src, size_t srcLen) const
1751 {
1752 if ( srcLen == wxNO_LEN )
1753 srcLen = wxWcslen(src) + 1;
1754
1755 srcLen *= BYTES_PER_CHAR;
1756
1757 if ( dst )
1758 {
1759 if ( dstLen < srcLen )
1760 return wxCONV_FAILED;
1761
1762 memcpy(dst, src, srcLen);
1763 }
1764
1765 return srcLen;
1766 }
1767
1768 // ----------------------------------------------------------------------------
1769 // endian-reversing conversions
1770 // ----------------------------------------------------------------------------
1771
1772 size_t
1773 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1774 const char *src, size_t srcLen) const
1775 {
1776 srcLen = GetLength(src, srcLen);
1777 if ( srcLen == wxNO_LEN )
1778 return wxCONV_FAILED;
1779
1780 srcLen /= BYTES_PER_CHAR;
1781
1782 if ( dst )
1783 {
1784 if ( dstLen < srcLen )
1785 return wxCONV_FAILED;
1786
1787 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1788 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1789 {
1790 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1791 }
1792 }
1793
1794 return srcLen;
1795 }
1796
1797 size_t
1798 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1799 const wchar_t *src, size_t srcLen) const
1800 {
1801 if ( srcLen == wxNO_LEN )
1802 srcLen = wxWcslen(src) + 1;
1803
1804 srcLen *= BYTES_PER_CHAR;
1805
1806 if ( dst )
1807 {
1808 if ( dstLen < srcLen )
1809 return wxCONV_FAILED;
1810
1811 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1812 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1813 {
1814 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1815 }
1816 }
1817
1818 return srcLen;
1819 }
1820
1821 #endif // WC_UTF16/!WC_UTF16
1822
1823
1824 // ============================================================================
1825 // The classes doing conversion using the iconv_xxx() functions
1826 // ============================================================================
1827
1828 #ifdef HAVE_ICONV
1829
1830 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1831 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1832 // (unless there's yet another bug in glibc) the only case when iconv()
1833 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1834 // left in the input buffer -- when _real_ error occurs,
1835 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1836 // iconv() failure.
1837 // [This bug does not appear in glibc 2.2.]
1838 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1839 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1840 (errno != E2BIG || bufLeft != 0))
1841 #else
1842 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1843 #endif
1844
1845 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1846
1847 #define ICONV_T_INVALID ((iconv_t)-1)
1848
1849 #if SIZEOF_WCHAR_T == 4
1850 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1851 #define WC_ENC wxFONTENCODING_UTF32
1852 #elif SIZEOF_WCHAR_T == 2
1853 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1854 #define WC_ENC wxFONTENCODING_UTF16
1855 #else // sizeof(wchar_t) != 2 nor 4
1856 // does this ever happen?
1857 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1858 #endif
1859
1860 // ----------------------------------------------------------------------------
1861 // wxMBConv_iconv: encapsulates an iconv character set
1862 // ----------------------------------------------------------------------------
1863
1864 class wxMBConv_iconv : public wxMBConv
1865 {
1866 public:
1867 wxMBConv_iconv(const char *name);
1868 virtual ~wxMBConv_iconv();
1869
1870 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1871 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1872
1873 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1874 virtual size_t GetMBNulLen() const;
1875
1876 #if wxUSE_UNICODE_UTF8
1877 virtual bool IsUTF8() const;
1878 #endif
1879
1880 virtual wxMBConv *Clone() const
1881 {
1882 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1883 p->m_minMBCharWidth = m_minMBCharWidth;
1884 return p;
1885 }
1886
1887 bool IsOk() const
1888 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1889
1890 protected:
1891 // the iconv handlers used to translate from multibyte
1892 // to wide char and in the other direction
1893 iconv_t m2w,
1894 w2m;
1895
1896 #if wxUSE_THREADS
1897 // guards access to m2w and w2m objects
1898 wxMutex m_iconvMutex;
1899 #endif
1900
1901 private:
1902 // the name (for iconv_open()) of a wide char charset -- if none is
1903 // available on this machine, it will remain NULL
1904 static wxString ms_wcCharsetName;
1905
1906 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1907 // different endian-ness than the native one
1908 static bool ms_wcNeedsSwap;
1909
1910
1911 // name of the encoding handled by this conversion
1912 wxString m_name;
1913
1914 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1915 // initially
1916 size_t m_minMBCharWidth;
1917 };
1918
1919 // make the constructor available for unit testing
1920 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1921 {
1922 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1923 if ( !result->IsOk() )
1924 {
1925 delete result;
1926 return 0;
1927 }
1928
1929 return result;
1930 }
1931
1932 wxString wxMBConv_iconv::ms_wcCharsetName;
1933 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1934
1935 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1936 : m_name(name)
1937 {
1938 m_minMBCharWidth = 0;
1939
1940 // check for charset that represents wchar_t:
1941 if ( ms_wcCharsetName.empty() )
1942 {
1943 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1944
1945 #if wxUSE_FONTMAP
1946 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1947 #else // !wxUSE_FONTMAP
1948 static const wxChar *names_static[] =
1949 {
1950 #if SIZEOF_WCHAR_T == 4
1951 _T("UCS-4"),
1952 #elif SIZEOF_WCHAR_T = 2
1953 _T("UCS-2"),
1954 #endif
1955 NULL
1956 };
1957 const wxChar **names = names_static;
1958 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1959
1960 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1961 {
1962 const wxString nameCS(*names);
1963
1964 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1965 wxString nameXE(nameCS);
1966
1967 #ifdef WORDS_BIGENDIAN
1968 nameXE += _T("BE");
1969 #else // little endian
1970 nameXE += _T("LE");
1971 #endif
1972
1973 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1974 nameXE.c_str());
1975
1976 m2w = iconv_open(nameXE.ToAscii(), name);
1977 if ( m2w == ICONV_T_INVALID )
1978 {
1979 // try charset w/o bytesex info (e.g. "UCS4")
1980 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1981 nameCS.c_str());
1982 m2w = iconv_open(nameCS.ToAscii(), name);
1983
1984 // and check for bytesex ourselves:
1985 if ( m2w != ICONV_T_INVALID )
1986 {
1987 char buf[2], *bufPtr;
1988 wchar_t wbuf[2], *wbufPtr;
1989 size_t insz, outsz;
1990 size_t res;
1991
1992 buf[0] = 'A';
1993 buf[1] = 0;
1994 wbuf[0] = 0;
1995 insz = 2;
1996 outsz = SIZEOF_WCHAR_T * 2;
1997 wbufPtr = wbuf;
1998 bufPtr = buf;
1999
2000 res = iconv(
2001 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2002 (char**)&wbufPtr, &outsz);
2003
2004 if (ICONV_FAILED(res, insz))
2005 {
2006 wxLogLastError(wxT("iconv"));
2007 wxLogError(_("Conversion to charset '%s' doesn't work."),
2008 nameCS.c_str());
2009 }
2010 else // ok, can convert to this encoding, remember it
2011 {
2012 ms_wcCharsetName = nameCS;
2013 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2014 }
2015 }
2016 }
2017 else // use charset not requiring byte swapping
2018 {
2019 ms_wcCharsetName = nameXE;
2020 }
2021 }
2022
2023 wxLogTrace(TRACE_STRCONV,
2024 wxT("iconv wchar_t charset is \"%s\"%s"),
2025 ms_wcCharsetName.empty() ? wxString("<none>")
2026 : ms_wcCharsetName,
2027 ms_wcNeedsSwap ? _T(" (needs swap)")
2028 : _T(""));
2029 }
2030 else // we already have ms_wcCharsetName
2031 {
2032 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2033 }
2034
2035 if ( ms_wcCharsetName.empty() )
2036 {
2037 w2m = ICONV_T_INVALID;
2038 }
2039 else
2040 {
2041 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2042 if ( w2m == ICONV_T_INVALID )
2043 {
2044 wxLogTrace(TRACE_STRCONV,
2045 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2046 ms_wcCharsetName.c_str(), name);
2047 }
2048 }
2049 }
2050
2051 wxMBConv_iconv::~wxMBConv_iconv()
2052 {
2053 if ( m2w != ICONV_T_INVALID )
2054 iconv_close(m2w);
2055 if ( w2m != ICONV_T_INVALID )
2056 iconv_close(w2m);
2057 }
2058
2059 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2060 {
2061 // find the string length: notice that must be done differently for
2062 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2063 size_t inbuf;
2064 const size_t nulLen = GetMBNulLen();
2065 switch ( nulLen )
2066 {
2067 default:
2068 return wxCONV_FAILED;
2069
2070 case 1:
2071 inbuf = strlen(psz); // arguably more optimized than our version
2072 break;
2073
2074 case 2:
2075 case 4:
2076 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2077 // they also have to start at character boundary and not span two
2078 // adjacent characters
2079 const char *p;
2080 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2081 ;
2082 inbuf = p - psz;
2083 break;
2084 }
2085
2086 #if wxUSE_THREADS
2087 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2088 // Unfortunately there are a couple of global wxCSConv objects such as
2089 // wxConvLocal that are used all over wx code, so we have to make sure
2090 // the handle is used by at most one thread at the time. Otherwise
2091 // only a few wx classes would be safe to use from non-main threads
2092 // as MB<->WC conversion would fail "randomly".
2093 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2094 #endif // wxUSE_THREADS
2095
2096 size_t outbuf = n * SIZEOF_WCHAR_T;
2097 size_t res, cres;
2098 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2099 wchar_t *bufPtr = buf;
2100 const char *pszPtr = psz;
2101
2102 if (buf)
2103 {
2104 // have destination buffer, convert there
2105 cres = iconv(m2w,
2106 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2107 (char**)&bufPtr, &outbuf);
2108 res = n - (outbuf / SIZEOF_WCHAR_T);
2109
2110 if (ms_wcNeedsSwap)
2111 {
2112 // convert to native endianness
2113 for ( unsigned i = 0; i < res; i++ )
2114 buf[n] = WC_BSWAP(buf[i]);
2115 }
2116
2117 // NUL-terminate the string if there is any space left
2118 if (res < n)
2119 buf[res] = 0;
2120 }
2121 else
2122 {
2123 // no destination buffer... convert using temp buffer
2124 // to calculate destination buffer requirement
2125 wchar_t tbuf[8];
2126 res = 0;
2127
2128 do
2129 {
2130 bufPtr = tbuf;
2131 outbuf = 8 * SIZEOF_WCHAR_T;
2132
2133 cres = iconv(m2w,
2134 ICONV_CHAR_CAST(&pszPtr), &inbuf,
2135 (char**)&bufPtr, &outbuf );
2136
2137 res += 8 - (outbuf / SIZEOF_WCHAR_T);
2138 }
2139 while ((cres == (size_t)-1) && (errno == E2BIG));
2140 }
2141
2142 if (ICONV_FAILED(cres, inbuf))
2143 {
2144 //VS: it is ok if iconv fails, hence trace only
2145 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2146 return wxCONV_FAILED;
2147 }
2148
2149 return res;
2150 }
2151
2152 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2153 {
2154 #if wxUSE_THREADS
2155 // NB: explained in MB2WC
2156 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2157 #endif
2158
2159 size_t inlen = wxWcslen(psz);
2160 size_t inbuf = inlen * SIZEOF_WCHAR_T;
2161 size_t outbuf = n;
2162 size_t res, cres;
2163
2164 wchar_t *tmpbuf = 0;
2165
2166 if (ms_wcNeedsSwap)
2167 {
2168 // need to copy to temp buffer to switch endianness
2169 // (doing WC_BSWAP twice on the original buffer won't help, as it
2170 // could be in read-only memory, or be accessed in some other thread)
2171 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2172 for ( size_t i = 0; i < inlen; i++ )
2173 tmpbuf[n] = WC_BSWAP(psz[i]);
2174
2175 tmpbuf[inlen] = L'\0';
2176 psz = tmpbuf;
2177 }
2178
2179 if (buf)
2180 {
2181 // have destination buffer, convert there
2182 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2183
2184 res = n - outbuf;
2185
2186 // NB: iconv was given only wcslen(psz) characters on input, and so
2187 // it couldn't convert the trailing zero. Let's do it ourselves
2188 // if there's some room left for it in the output buffer.
2189 if (res < n)
2190 buf[0] = 0;
2191 }
2192 else
2193 {
2194 // no destination buffer: convert using temp buffer
2195 // to calculate destination buffer requirement
2196 char tbuf[16];
2197 res = 0;
2198 do
2199 {
2200 buf = tbuf;
2201 outbuf = 16;
2202
2203 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2204
2205 res += 16 - outbuf;
2206 }
2207 while ((cres == (size_t)-1) && (errno == E2BIG));
2208 }
2209
2210 if (ms_wcNeedsSwap)
2211 {
2212 free(tmpbuf);
2213 }
2214
2215 if (ICONV_FAILED(cres, inbuf))
2216 {
2217 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2218 return wxCONV_FAILED;
2219 }
2220
2221 return res;
2222 }
2223
2224 size_t wxMBConv_iconv::GetMBNulLen() const
2225 {
2226 if ( m_minMBCharWidth == 0 )
2227 {
2228 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2229
2230 #if wxUSE_THREADS
2231 // NB: explained in MB2WC
2232 wxMutexLocker lock(self->m_iconvMutex);
2233 #endif
2234
2235 const wchar_t *wnul = L"";
2236 char buf[8]; // should be enough for NUL in any encoding
2237 size_t inLen = sizeof(wchar_t),
2238 outLen = WXSIZEOF(buf);
2239 char *inBuff = (char *)wnul;
2240 char *outBuff = buf;
2241 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2242 {
2243 self->m_minMBCharWidth = (size_t)-1;
2244 }
2245 else // ok
2246 {
2247 self->m_minMBCharWidth = outBuff - buf;
2248 }
2249 }
2250
2251 return m_minMBCharWidth;
2252 }
2253
#if wxUSE_UNICODE_UTF8
bool wxMBConv_iconv::IsUTF8() const
{
    // compare the charset name with both spellings of UTF-8 using
    // wxStricmp()
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
2261
2262 #endif // HAVE_ICONV
2263
2264
2265 // ============================================================================
2266 // Win32 conversion classes
2267 // ============================================================================
2268
2269 #ifdef wxHAVE_WIN32_MB2WC
2270
2271 // from utils.cpp
2272 #if wxUSE_FONTMAP
2273 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2274 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2275 #endif
2276
2277 class wxMBConv_win32 : public wxMBConv
2278 {
2279 public:
2280 wxMBConv_win32()
2281 {
2282 m_CodePage = CP_ACP;
2283 m_minMBCharWidth = 0;
2284 }
2285
2286 wxMBConv_win32(const wxMBConv_win32& conv)
2287 : wxMBConv()
2288 {
2289 m_CodePage = conv.m_CodePage;
2290 m_minMBCharWidth = conv.m_minMBCharWidth;
2291 }
2292
2293 #if wxUSE_FONTMAP
2294 wxMBConv_win32(const char* name)
2295 {
2296 m_CodePage = wxCharsetToCodepage(name);
2297 m_minMBCharWidth = 0;
2298 }
2299
2300 wxMBConv_win32(wxFontEncoding encoding)
2301 {
2302 m_CodePage = wxEncodingToCodepage(encoding);
2303 m_minMBCharWidth = 0;
2304 }
2305 #endif // wxUSE_FONTMAP
2306
2307 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2308 {
2309 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2310 // the behaviour is not compatible with the Unix version (using iconv)
2311 // and break the library itself, e.g. wxTextInputStream::NextChar()
2312 // wouldn't work if reading an incomplete MB char didn't result in an
2313 // error
2314 //
2315 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2316 // Win XP or newer and it is not supported for UTF-[78] so we always
2317 // use our own conversions in this case. See
2318 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2319 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2320 if ( m_CodePage == CP_UTF8 )
2321 {
2322 return wxMBConvUTF8().MB2WC(buf, psz, n);
2323 }
2324
2325 if ( m_CodePage == CP_UTF7 )
2326 {
2327 return wxMBConvUTF7().MB2WC(buf, psz, n);
2328 }
2329
2330 int flags = 0;
2331 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2332 IsAtLeastWin2kSP4() )
2333 {
2334 flags = MB_ERR_INVALID_CHARS;
2335 }
2336
2337 const size_t len = ::MultiByteToWideChar
2338 (
2339 m_CodePage, // code page
2340 flags, // flags: fall on error
2341 psz, // input string
2342 -1, // its length (NUL-terminated)
2343 buf, // output string
2344 buf ? n : 0 // size of output buffer
2345 );
2346 if ( !len )
2347 {
2348 // function totally failed
2349 return wxCONV_FAILED;
2350 }
2351
2352 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2353 // check if we succeeded, by doing a double trip:
2354 if ( !flags && buf )
2355 {
2356 const size_t mbLen = strlen(psz);
2357 wxCharBuffer mbBuf(mbLen);
2358 if ( ::WideCharToMultiByte
2359 (
2360 m_CodePage,
2361 0,
2362 buf,
2363 -1,
2364 mbBuf.data(),
2365 mbLen + 1, // size in bytes, not length
2366 NULL,
2367 NULL
2368 ) == 0 ||
2369 strcmp(mbBuf, psz) != 0 )
2370 {
2371 // we didn't obtain the same thing we started from, hence
2372 // the conversion was lossy and we consider that it failed
2373 return wxCONV_FAILED;
2374 }
2375 }
2376
2377 // note that it returns count of written chars for buf != NULL and size
2378 // of the needed buffer for buf == NULL so in either case the length of
2379 // the string (which never includes the terminating NUL) is one less
2380 return len - 1;
2381 }
2382
2383 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2384 {
2385 /*
2386 we have a problem here: by default, WideCharToMultiByte() may
2387 replace characters unrepresentable in the target code page with bad
2388 quality approximations such as turning "1/2" symbol (U+00BD) into
2389 "1" for the code pages which don't have it and we, obviously, want
2390 to avoid this at any price
2391
2392 the trouble is that this function does it _silently_, i.e. it won't
2393 even tell us whether it did or not... Win98/2000 and higher provide
2394 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2395 we have to resort to a round trip, i.e. check that converting back
2396 results in the same string -- this is, of course, expensive but
2397 otherwise we simply can't be sure to not garble the data.
2398 */
2399
2400 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2401 // it doesn't work with CJK encodings (which we test for rather roughly
2402 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2403 // supporting it
2404 BOOL usedDef wxDUMMY_INITIALIZE(false);
2405 BOOL *pUsedDef;
2406 int flags;
2407 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2408 {
2409 // it's our lucky day
2410 flags = WC_NO_BEST_FIT_CHARS;
2411 pUsedDef = &usedDef;
2412 }
2413 else // old system or unsupported encoding
2414 {
2415 flags = 0;
2416 pUsedDef = NULL;
2417 }
2418
2419 const size_t len = ::WideCharToMultiByte
2420 (
2421 m_CodePage, // code page
2422 flags, // either none or no best fit
2423 pwz, // input string
2424 -1, // it is (wide) NUL-terminated
2425 buf, // output buffer
2426 buf ? n : 0, // and its size
2427 NULL, // default "replacement" char
2428 pUsedDef // [out] was it used?
2429 );
2430
2431 if ( !len )
2432 {
2433 // function totally failed
2434 return wxCONV_FAILED;
2435 }
2436
2437 // we did something, check if we really succeeded
2438 if ( flags )
2439 {
2440 // check if the conversion failed, i.e. if any replacements
2441 // were done
2442 if ( usedDef )
2443 return wxCONV_FAILED;
2444 }
2445 else // we must resort to double tripping...
2446 {
2447 // first we need to ensure that we really have the MB data: this is
2448 // not the case if we're called with NULL buffer, in which case we
2449 // need to do the conversion yet again
2450 wxCharBuffer bufDef;
2451 if ( !buf )
2452 {
2453 bufDef = wxCharBuffer(len);
2454 buf = bufDef.data();
2455 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2456 buf, len, NULL, NULL) )
2457 return wxCONV_FAILED;
2458 }
2459
2460 if ( !n )
2461 n = wcslen(pwz);
2462 wxWCharBuffer wcBuf(n);
2463 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2464 wcscmp(wcBuf, pwz) != 0 )
2465 {
2466 // we didn't obtain the same thing we started from, hence
2467 // the conversion was lossy and we consider that it failed
2468 return wxCONV_FAILED;
2469 }
2470 }
2471
2472 // see the comment above for the reason of "len - 1"
2473 return len - 1;
2474 }
2475
2476 virtual size_t GetMBNulLen() const
2477 {
2478 if ( m_minMBCharWidth == 0 )
2479 {
2480 int len = ::WideCharToMultiByte
2481 (
2482 m_CodePage, // code page
2483 0, // no flags
2484 L"", // input string
2485 1, // translate just the NUL
2486 NULL, // output buffer
2487 0, // and its size
2488 NULL, // no replacement char
2489 NULL // [out] don't care if it was used
2490 );
2491
2492 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2493 switch ( len )
2494 {
2495 default:
2496 wxLogDebug(_T("Unexpected NUL length %d"), len);
2497 self->m_minMBCharWidth = (size_t)-1;
2498 break;
2499
2500 case 0:
2501 self->m_minMBCharWidth = (size_t)-1;
2502 break;
2503
2504 case 1:
2505 case 2:
2506 case 4:
2507 self->m_minMBCharWidth = len;
2508 break;
2509 }
2510 }
2511
2512 return m_minMBCharWidth;
2513 }
2514
2515 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2516
2517 bool IsOk() const { return m_CodePage != -1; }
2518
2519 private:
2520 static bool CanUseNoBestFit()
2521 {
2522 static int s_isWin98Or2k = -1;
2523
2524 if ( s_isWin98Or2k == -1 )
2525 {
2526 int verMaj, verMin;
2527 switch ( wxGetOsVersion(&verMaj, &verMin) )
2528 {
2529 case wxOS_WINDOWS_9X:
2530 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2531 break;
2532
2533 case wxOS_WINDOWS_NT:
2534 s_isWin98Or2k = verMaj >= 5;
2535 break;
2536
2537 default:
2538 // unknown: be conservative by default
2539 s_isWin98Or2k = 0;
2540 break;
2541 }
2542
2543 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2544 }
2545
2546 return s_isWin98Or2k == 1;
2547 }
2548
2549 static bool IsAtLeastWin2kSP4()
2550 {
2551 #ifdef __WXWINCE__
2552 return false;
2553 #else
2554 static int s_isAtLeastWin2kSP4 = -1;
2555
2556 if ( s_isAtLeastWin2kSP4 == -1 )
2557 {
2558 OSVERSIONINFOEX ver;
2559
2560 memset(&ver, 0, sizeof(ver));
2561 ver.dwOSVersionInfoSize = sizeof(ver);
2562 GetVersionEx((OSVERSIONINFO*)&ver);
2563
2564 s_isAtLeastWin2kSP4 =
2565 ((ver.dwMajorVersion > 5) || // Vista+
2566 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2567 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2568 ver.wServicePackMajor >= 4)) // 2000 SP4+
2569 ? 1 : 0;
2570 }
2571
2572 return s_isAtLeastWin2kSP4 == 1;
2573 #endif
2574 }
2575
2576
2577 // the code page we're working with
2578 long m_CodePage;
2579
2580 // cached result of GetMBNulLen(), set to 0 initially meaning
2581 // "unknown"
2582 size_t m_minMBCharWidth;
2583 };
2584
2585 #endif // wxHAVE_WIN32_MB2WC
2586
2587
2588 // ============================================================================
2589 // wxEncodingConverter based conversion classes
2590 // ============================================================================
2591
2592 #if wxUSE_FONTMAP
2593
2594 class wxMBConv_wxwin : public wxMBConv
2595 {
2596 private:
2597 void Init()
2598 {
2599 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2600 // The wxMBConv_cf class does a better job.
2601 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2602 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2603 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2604 }
2605
2606 public:
2607 // temporarily just use wxEncodingConverter stuff,
2608 // so that it works while a better implementation is built
2609 wxMBConv_wxwin(const char* name)
2610 {
2611 if (name)
2612 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2613 else
2614 m_enc = wxFONTENCODING_SYSTEM;
2615
2616 Init();
2617 }
2618
2619 wxMBConv_wxwin(wxFontEncoding enc)
2620 {
2621 m_enc = enc;
2622
2623 Init();
2624 }
2625
2626 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2627 {
2628 size_t inbuf = strlen(psz);
2629 if (buf)
2630 {
2631 if (!m2w.Convert(psz, buf))
2632 return wxCONV_FAILED;
2633 }
2634 return inbuf;
2635 }
2636
2637 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2638 {
2639 const size_t inbuf = wxWcslen(psz);
2640 if (buf)
2641 {
2642 if (!w2m.Convert(psz, buf))
2643 return wxCONV_FAILED;
2644 }
2645
2646 return inbuf;
2647 }
2648
2649 virtual size_t GetMBNulLen() const
2650 {
2651 switch ( m_enc )
2652 {
2653 case wxFONTENCODING_UTF16BE:
2654 case wxFONTENCODING_UTF16LE:
2655 return 2;
2656
2657 case wxFONTENCODING_UTF32BE:
2658 case wxFONTENCODING_UTF32LE:
2659 return 4;
2660
2661 default:
2662 return 1;
2663 }
2664 }
2665
2666 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2667
2668 bool IsOk() const { return m_ok; }
2669
2670 public:
2671 wxFontEncoding m_enc;
2672 wxEncodingConverter m2w, w2m;
2673
2674 private:
2675 // were we initialized successfully?
2676 bool m_ok;
2677
2678 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2679 };
2680
2681 // make the constructors available for unit testing
2682 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2683 {
2684 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2685 if ( !result->IsOk() )
2686 {
2687 delete result;
2688 return 0;
2689 }
2690
2691 return result;
2692 }
2693
2694 #endif // wxUSE_FONTMAP
2695
2696 // ============================================================================
2697 // wxCSConv implementation
2698 // ============================================================================
2699
2700 void wxCSConv::Init()
2701 {
2702 m_name = NULL;
2703 m_convReal = NULL;
2704 m_deferred = true;
2705 }
2706
2707 wxCSConv::wxCSConv(const wxString& charset)
2708 {
2709 Init();
2710
2711 if ( !charset.empty() )
2712 {
2713 SetName(charset.ToAscii());
2714 }
2715
2716 #if wxUSE_FONTMAP
2717 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2718 #else
2719 m_encoding = wxFONTENCODING_SYSTEM;
2720 #endif
2721 }
2722
2723 wxCSConv::wxCSConv(wxFontEncoding encoding)
2724 {
2725 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2726 {
2727 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2728
2729 encoding = wxFONTENCODING_SYSTEM;
2730 }
2731
2732 Init();
2733
2734 m_encoding = encoding;
2735 }
2736
2737 wxCSConv::~wxCSConv()
2738 {
2739 Clear();
2740 }
2741
2742 wxCSConv::wxCSConv(const wxCSConv& conv)
2743 : wxMBConv()
2744 {
2745 Init();
2746
2747 SetName(conv.m_name);
2748 m_encoding = conv.m_encoding;
2749 }
2750
2751 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2752 {
2753 Clear();
2754
2755 SetName(conv.m_name);
2756 m_encoding = conv.m_encoding;
2757
2758 return *this;
2759 }
2760
2761 void wxCSConv::Clear()
2762 {
2763 free(m_name);
2764 delete m_convReal;
2765
2766 m_name = NULL;
2767 m_convReal = NULL;
2768 }
2769
2770 void wxCSConv::SetName(const char *charset)
2771 {
2772 if (charset)
2773 {
2774 m_name = wxStrdup(charset);
2775 m_deferred = true;
2776 }
2777 }
2778
2779 #if wxUSE_FONTMAP
2780
2781 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2782 wxEncodingNameCache );
2783
2784 static wxEncodingNameCache gs_nameCache;
2785 #endif
2786
2787 wxMBConv *wxCSConv::DoCreate() const
2788 {
2789 #if wxUSE_FONTMAP
2790 wxLogTrace(TRACE_STRCONV,
2791 wxT("creating conversion for %s"),
2792 (m_name ? m_name
2793 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2794 #endif // wxUSE_FONTMAP
2795
2796 // check for the special case of ASCII or ISO8859-1 charset: as we have
2797 // special knowledge of it anyhow, we don't need to create a special
2798 // conversion object
2799 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2800 m_encoding == wxFONTENCODING_DEFAULT )
2801 {
2802 // don't convert at all
2803 return NULL;
2804 }
2805
2806 // we trust OS to do conversion better than we can so try external
2807 // conversion methods first
2808 //
2809 // the full order is:
2810 // 1. OS conversion (iconv() under Unix or Win32 API)
2811 // 2. hard coded conversions for UTF
2812 // 3. wxEncodingConverter as fall back
2813
2814 // step (1)
2815 #ifdef HAVE_ICONV
2816 #if !wxUSE_FONTMAP
2817 if ( m_name )
2818 #endif // !wxUSE_FONTMAP
2819 {
2820 #if wxUSE_FONTMAP
2821 wxFontEncoding encoding(m_encoding);
2822 #endif
2823
2824 if ( m_name )
2825 {
2826 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2827 if ( conv->IsOk() )
2828 return conv;
2829
2830 delete conv;
2831
2832 #if wxUSE_FONTMAP
2833 encoding =
2834 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2835 #endif // wxUSE_FONTMAP
2836 }
2837 #if wxUSE_FONTMAP
2838 {
2839 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2840 if ( it != gs_nameCache.end() )
2841 {
2842 if ( it->second.empty() )
2843 return NULL;
2844
2845 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2846 if ( conv->IsOk() )
2847 return conv;
2848
2849 delete conv;
2850 }
2851
2852 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2853 // CS : in case this does not return valid names (eg for MacRoman)
2854 // encoding got a 'failure' entry in the cache all the same,
2855 // although it just has to be created using a different method, so
2856 // only store failed iconv creation attempts (or perhaps we
2857 // shoulnd't do this at all ?)
2858 if ( names[0] != NULL )
2859 {
2860 for ( ; *names; ++names )
2861 {
2862 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2863 // will need changes that will obsolete this
2864 wxString name(*names);
2865 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2866 if ( conv->IsOk() )
2867 {
2868 gs_nameCache[encoding] = *names;
2869 return conv;
2870 }
2871
2872 delete conv;
2873 }
2874
2875 gs_nameCache[encoding] = _T(""); // cache the failure
2876 }
2877 }
2878 #endif // wxUSE_FONTMAP
2879 }
2880 #endif // HAVE_ICONV
2881
2882 #ifdef wxHAVE_WIN32_MB2WC
2883 {
2884 #if wxUSE_FONTMAP
2885 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2886 : new wxMBConv_win32(m_encoding);
2887 if ( conv->IsOk() )
2888 return conv;
2889
2890 delete conv;
2891 #else
2892 return NULL;
2893 #endif
2894 }
2895 #endif // wxHAVE_WIN32_MB2WC
2896
2897 #ifdef __DARWIN__
2898 {
2899 // leave UTF16 and UTF32 to the built-ins of wx
2900 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2901 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2902 {
2903 #if wxUSE_FONTMAP
2904 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2905 : new wxMBConv_cf(m_encoding);
2906 #else
2907 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2908 #endif
2909
2910 if ( conv->IsOk() )
2911 return conv;
2912
2913 delete conv;
2914 }
2915 }
2916 #endif // __DARWIN__
2917
2918 // step (2)
2919 wxFontEncoding enc = m_encoding;
2920 #if wxUSE_FONTMAP
2921 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2922 {
2923 // use "false" to suppress interactive dialogs -- we can be called from
2924 // anywhere and popping up a dialog from here is the last thing we want to
2925 // do
2926 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2927 }
2928 #endif // wxUSE_FONTMAP
2929
2930 switch ( enc )
2931 {
2932 case wxFONTENCODING_UTF7:
2933 return new wxMBConvUTF7;
2934
2935 case wxFONTENCODING_UTF8:
2936 return new wxMBConvUTF8;
2937
2938 case wxFONTENCODING_UTF16BE:
2939 return new wxMBConvUTF16BE;
2940
2941 case wxFONTENCODING_UTF16LE:
2942 return new wxMBConvUTF16LE;
2943
2944 case wxFONTENCODING_UTF32BE:
2945 return new wxMBConvUTF32BE;
2946
2947 case wxFONTENCODING_UTF32LE:
2948 return new wxMBConvUTF32LE;
2949
2950 default:
2951 // nothing to do but put here to suppress gcc warnings
2952 break;
2953 }
2954
2955 // step (3)
2956 #if wxUSE_FONTMAP
2957 {
2958 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2959 : new wxMBConv_wxwin(m_encoding);
2960 if ( conv->IsOk() )
2961 return conv;
2962
2963 delete conv;
2964 }
2965 #endif // wxUSE_FONTMAP
2966
2967 // NB: This is a hack to prevent deadlock. What could otherwise happen
2968 // in Unicode build: wxConvLocal creation ends up being here
2969 // because of some failure and logs the error. But wxLog will try to
2970 // attach a timestamp, for which it will need wxConvLocal (to convert
2971 // time to char* and then wchar_t*), but that fails, tries to log the
2972 // error, but wxLog has an (already locked) critical section that
2973 // guards the static buffer.
2974 static bool alreadyLoggingError = false;
2975 if (!alreadyLoggingError)
2976 {
2977 alreadyLoggingError = true;
2978 wxLogError(_("Cannot convert from the charset '%s'!"),
2979 m_name ? m_name
2980 :
2981 #if wxUSE_FONTMAP
2982 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2983 #else // !wxUSE_FONTMAP
2984 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2985 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2986 );
2987
2988 alreadyLoggingError = false;
2989 }
2990
2991 return NULL;
2992 }
2993
2994 void wxCSConv::CreateConvIfNeeded() const
2995 {
2996 if ( m_deferred )
2997 {
2998 wxCSConv *self = (wxCSConv *)this; // const_cast
2999
3000 // if we don't have neither the name nor the encoding, use the default
3001 // encoding for this system
3002 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3003 {
3004 #if wxUSE_INTL
3005 self->m_encoding = wxLocale::GetSystemEncoding();
3006 #else
3007 // fallback to some reasonable default:
3008 self->m_encoding = wxFONTENCODING_ISO8859_1;
3009 #endif // wxUSE_INTL
3010 }
3011
3012 self->m_convReal = DoCreate();
3013 self->m_deferred = false;
3014 }
3015 }
3016
3017 bool wxCSConv::IsOk() const
3018 {
3019 CreateConvIfNeeded();
3020
3021 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3022 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3023 return true; // always ok as we do it ourselves
3024
3025 // m_convReal->IsOk() is called at its own creation, so we know it must
3026 // be ok if m_convReal is non-NULL
3027 return m_convReal != NULL;
3028 }
3029
3030 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3031 const char *src, size_t srcLen) const
3032 {
3033 CreateConvIfNeeded();
3034
3035 if (m_convReal)
3036 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3037
3038 // latin-1 (direct)
3039 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3040 }
3041
3042 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3043 const wchar_t *src, size_t srcLen) const
3044 {
3045 CreateConvIfNeeded();
3046
3047 if (m_convReal)
3048 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3049
3050 // latin-1 (direct)
3051 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3052 }
3053
3054 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3055 {
3056 CreateConvIfNeeded();
3057
3058 if (m_convReal)
3059 return m_convReal->MB2WC(buf, psz, n);
3060
3061 // latin-1 (direct)
3062 size_t len = strlen(psz);
3063
3064 if (buf)
3065 {
3066 for (size_t c = 0; c <= len; c++)
3067 buf[c] = (unsigned char)(psz[c]);
3068 }
3069
3070 return len;
3071 }
3072
3073 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3074 {
3075 CreateConvIfNeeded();
3076
3077 if (m_convReal)
3078 return m_convReal->WC2MB(buf, psz, n);
3079
3080 // latin-1 (direct)
3081 const size_t len = wxWcslen(psz);
3082 if (buf)
3083 {
3084 for (size_t c = 0; c <= len; c++)
3085 {
3086 if (psz[c] > 0xFF)
3087 return wxCONV_FAILED;
3088
3089 buf[c] = (char)psz[c];
3090 }
3091 }
3092 else
3093 {
3094 for (size_t c = 0; c <= len; c++)
3095 {
3096 if (psz[c] > 0xFF)
3097 return wxCONV_FAILED;
3098 }
3099 }
3100
3101 return len;
3102 }
3103
3104 size_t wxCSConv::GetMBNulLen() const
3105 {
3106 CreateConvIfNeeded();
3107
3108 if ( m_convReal )
3109 {
3110 return m_convReal->GetMBNulLen();
3111 }
3112
3113 // otherwise, we are ISO-8859-1
3114 return 1;
3115 }
3116
3117 #if wxUSE_UNICODE_UTF8
3118 bool wxCSConv::IsUTF8() const
3119 {
3120 CreateConvIfNeeded();
3121
3122 if ( m_convReal )
3123 {
3124 return m_convReal->IsUTF8();
3125 }
3126
3127 // otherwise, we are ISO-8859-1
3128 return false;
3129 }
3130 #endif
3131
3132
3133 #if wxUSE_UNICODE
3134
3135 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3136 {
3137 if ( !s )
3138 return wxWCharBuffer();
3139
3140 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3141 if ( !wbuf )
3142 wbuf = wxMBConvUTF8().cMB2WX(s);
3143 if ( !wbuf )
3144 wbuf = wxConvISO8859_1.cMB2WX(s);
3145
3146 return wbuf;
3147 }
3148
3149 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3150 {
3151 if ( !ws )
3152 return wxCharBuffer();
3153
3154 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3155 if ( !buf )
3156 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3157
3158 return buf;
3159 }
3160
3161 #endif // wxUSE_UNICODE
3162
3163 // ----------------------------------------------------------------------------
3164 // globals
3165 // ----------------------------------------------------------------------------
3166
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object), in other
//     words possibly _before_ the global converter object would be
//     initialized.
3173
3174 #undef wxConvLibc
3175 #undef wxConvUTF8
3176 #undef wxConvUTF7
3177 #undef wxConvLocal
3178 #undef wxConvISO8859_1
3179
// Define the global converter "name":
//  - the pointer variable name##Ptr of type base_t*, initially NULL,
//  - the accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_t constructed with
//    ctor_args,
//  - a file-static pointer initialized by calling the accessor: this ensures
//    that all global converter objects are created by the time static
//    initialization is done, i.e. before any thread is launched.
//
// The expansion doesn't end with a semicolon, the user of the macro must
// supply it.
#define WX_DEFINE_GLOBAL_CONV2(base_t, impl_t, name, ctor_args)             \
    WXDLLIMPEXP_DATA_BASE(base_t*) name##Ptr = NULL;                        \
    WXDLLIMPEXP_BASE base_t* wxGet_##name##Ptr()                            \
    {                                                                       \
        static impl_t name##Obj ctor_args;                                  \
        return &name##Obj;                                                  \
    }                                                                       \
    static base_t* gs_##name##instance = wxGet_##name##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() for the common case when the class of the
// object is the same as the type of the pointer to it.
#define WX_DEFINE_GLOBAL_CONV(conv_t, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(conv_t, conv_t, name, ctor_args)
3194
3195 #ifdef __WINDOWS__
3196 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3197 #else
3198 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3199 #endif
3200
// NB: we can't use wxEMPTY_PARAMETER_VALUE as the final argument here because
//     it's passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so
//     still provokes an error message about "not enough macro parameters";
//     and we can't use "()" here either, as the name##Obj declaration would
//     then be parsed as a function declaration, so use a semicolon and live
//     with an extra empty statement (and hope that no compiler warns about it)
3207 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3208 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3209
3210 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3211 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3212
3213 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3214 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3215
3216 #ifdef __DARWIN__
3217 // The xnu kernel always communicates file paths in decomposed UTF-8.
3218 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3219 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3220 #endif
3221
3222 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3223 #ifdef __DARWIN__
3224 &wxConvMacUTF8DObj;
3225 #else // !__DARWIN__
3226 wxGet_wxConvLibcPtr();
3227 #endif // __DARWIN__/!__DARWIN__
3228
3229 #else // !wxUSE_WCHAR_T
3230
3231 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3232 // stand-ins in absence of wchar_t
3233 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3234 wxConvISO8859_1,
3235 wxConvLocal,
3236 wxConvUTF8;
3237
3238 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T