]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
5c239e2728200f05a541d5dee30614632b0c471d
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __DARWIN__
56 #include "wx/mac/corefoundation/private/strconv_cf.h"
57 #endif //def __DARWIN__
58
59
// NOTE(review): presumably the mask used for the trace messages of this file;
// no use of it is visible in this part of the source -- confirm
60 #define TRACE_STRCONV _T("strconv")
61
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
63 // be 4 bytes
//
// the conversion code below tests this symbol to decide whether wide characters
// must go through encode_utf16()/decode_utf16() (surrogate pairs) or can be
// stored directly as a single wchar_t
64 #if SIZEOF_WCHAR_T == 2
65 #define WC_UTF16
66 #endif
67
68
69 // ============================================================================
70 // implementation
71 // ============================================================================
72
// helper used to detect the (possibly multi-byte) NUL terminator: check
// whether the n bytes starting at p contain at least one non-NUL byte
//
// returns false if all n bytes are NUL (in particular if n is 0) and true
// otherwise
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
81
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
85
86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
87 {
88 if (input <= 0xffff)
89 {
90 if (output)
91 *output = (wxUint16) input;
92
93 return 1;
94 }
95 else if (input >= 0x110000)
96 {
97 return wxCONV_FAILED;
98 }
99 else
100 {
101 if (output)
102 {
103 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
104 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
105 }
106
107 return 2;
108 }
109 }
110
111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
112 {
113 if ((*input < 0xd800) || (*input > 0xdfff))
114 {
115 output = *input;
116 return 1;
117 }
118 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
119 {
120 output = *input;
121 return wxCONV_FAILED;
122 }
123 else
124 {
125 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
126 return 2;
127 }
128 }
129
130 #ifdef WC_UTF16
131 typedef wchar_t wxDecodeSurrogate_t;
132 #else // !WC_UTF16
133 typedef wxUint16 wxDecodeSurrogate_t;
134 #endif // WC_UTF16/!WC_UTF16
135
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
138 //
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
140 // check for this
141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
142 {
143 wxUint32 out;
144 const size_t
145 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
146 if ( n == wxCONV_FAILED )
147 *pSrc = NULL;
148 else
149 *pSrc += n;
150
151 return out;
152 }
153
154 // ----------------------------------------------------------------------------
155 // wxMBConv
156 // ----------------------------------------------------------------------------
157
// Default implementation of the multibyte to wide char conversion, written in
// terms of MB2WC().
//
// Converts src, which is either srcLen bytes long or NUL-terminated if srcLen
// is wxNO_LEN, chunk by chunk (a chunk being a NUL-terminated part of the
// input). If dst is NULL nothing is written and only the length is computed.
//
// Returns the number of wide characters [which would be] written, counting one
// L'\0' for each chunk, or wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or
// if dst is not NULL and dstLen is too small.
158 size_t
159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
160 const char *src, size_t srcLen) const
161 {
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid to have to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
168
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten = 0;
171
172 // the number of NULs terminating this string
173 size_t nulLen = 0; // not really needed, but just to avoid warnings
174
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
178 // NULs at the end
179 wxCharBuffer bufTmp;
180 const char *srcEnd;
181 if ( srcLen != wxNO_LEN )
182 {
183 // we need to know how to find the end of this string
184 nulLen = GetMBNulLen();
185 if ( nulLen == wxCONV_FAILED )
186 return wxCONV_FAILED;
187
188 // if there are enough NULs we can avoid the copy
189 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
190 {
191 // make a copy in order to properly NUL-terminate the string
192 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
193 char * const p = bufTmp.data();
194 memcpy(p, src, srcLen);
195 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
196 *s = '\0';
197
198 src = bufTmp;
199 }
200
201 srcEnd = src + srcLen;
202 }
203 else // quit after the first loop iteration
204 {
205 srcEnd = NULL;
206 }
207
208 for ( ;; )
209 {
210 // try to convert the current chunk
211 size_t lenChunk = MB2WC(NULL, src, 0);
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
214
215 lenChunk++; // for the L'\0' at the end of this chunk
216
217 dstWritten += lenChunk;
218
219 if ( lenChunk == 1 )
220 {
221 // nothing left in the input string, conversion succeeded
// NOTE(review): the L'\0' counted for this empty chunk is not stored
// in dst as we leave the loop before calling MB2WC() for it; callers
// apparently have to pre-initialize that element -- confirm
222 break;
223 }
224
225 if ( dst )
226 {
227 if ( dstWritten > dstLen )
228 return wxCONV_FAILED;
229
230 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
231 return wxCONV_FAILED;
232
233 dst += lenChunk;
234 }
235
236 if ( !srcEnd )
237 {
238 // we convert just one chunk in this case as this is the entire
239 // string anyhow
240 break;
241 }
242
243 // advance the input pointer past the end of this chunk
244 while ( NotAllNULs(src, nulLen) )
245 {
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
250 src += nulLen;
251 }
252
253 src += nulLen; // skipping over its terminator as well
254
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by srcEnd
258 if ( src >= srcEnd )
259 break;
260 }
261
262 return dstWritten;
263 }
264
265 size_t
266 wxMBConv::FromWChar(char *dst, size_t dstLen,
267 const wchar_t *src, size_t srcLen) const
268 {
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten = 0;
271
272 // make a copy of the input string unless it is already properly
273 // NUL-terminated
274 //
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp;
278 if ( srcLen == wxNO_LEN )
279 {
280 srcLen = wxWcslen(src) + 1;
281 }
282 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
283 {
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp = wxWCharBuffer(srcLen);
286 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
287 src = bufTmp;
288 }
289
290 const size_t lenNul = GetMBNulLen();
291 for ( const wchar_t * const srcEnd = src + srcLen;
292 src < srcEnd;
293 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
294 {
295 // try to convert the current chunk
296 size_t lenChunk = WC2MB(NULL, src, 0);
297
298 if ( lenChunk == wxCONV_FAILED )
299 return wxCONV_FAILED;
300
301 lenChunk += lenNul;
302 dstWritten += lenChunk;
303
304 if ( dst )
305 {
306 if ( dstWritten > dstLen )
307 return wxCONV_FAILED;
308
309 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
310 return wxCONV_FAILED;
311
312 dst += lenChunk;
313 }
314 }
315
316 return dstWritten;
317 }
318
319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
320 {
321 size_t rc = ToWChar(outBuff, outLen, inBuff);
322 if ( rc != wxCONV_FAILED )
323 {
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
326 rc--;
327 }
328
329 return rc;
330 }
331
332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
333 {
334 size_t rc = FromWChar(outBuff, outLen, inBuff);
335 if ( rc != wxCONV_FAILED )
336 {
337 rc -= GetMBNulLen();
338 }
339
340 return rc;
341 }
342
343 wxMBConv::~wxMBConv()
344 {
345 // nothing to do here (necessary for Darwin linking probably)
346 }
347
348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
349 {
350 if ( psz )
351 {
352 // calculate the length of the buffer needed first
353 const size_t nLen = ToWChar(NULL, 0, psz);
354 if ( nLen != wxCONV_FAILED )
355 {
356 // now do the actual conversion
357 wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
358
359 // +1 for the trailing NULL
360 if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
361 return buf;
362 }
363 }
364
365 return wxWCharBuffer();
366 }
367
368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
369 {
370 if ( pwz )
371 {
372 const size_t nLen = FromWChar(NULL, 0, pwz);
373 if ( nLen != wxCONV_FAILED )
374 {
375 wxCharBuffer buf(nLen - 1);
376 if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
377 return buf;
378 }
379 }
380
381 return wxCharBuffer();
382 }
383
384 const wxWCharBuffer
385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
386 {
387 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
388 if ( dstLen != wxCONV_FAILED )
389 {
390 // notice that we allocate space for dstLen+1 wide characters here
391 // because we want the buffer to always be NUL-terminated, even if the
392 // input isn't (as otherwise the caller has no way to know its length)
393 wxWCharBuffer wbuf(dstLen);
394 wbuf.data()[dstLen - 1] = L'\0';
395 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
396 {
397 if ( outLen )
398 {
399 *outLen = dstLen;
400 if ( wbuf[dstLen - 1] == L'\0' )
401 (*outLen)--;
402 }
403
404 return wbuf;
405 }
406 }
407
408 if ( outLen )
409 *outLen = 0;
410
411 return wxWCharBuffer();
412 }
413
414 const wxCharBuffer
415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
416 {
417 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
418 if ( dstLen != wxCONV_FAILED )
419 {
420 const size_t nulLen = GetMBNulLen();
421
422 // as above, ensure that the buffer is always NUL-terminated, even if
423 // the input is not
424 wxCharBuffer buf(dstLen + nulLen - 1);
425 memset(buf.data() + dstLen, 0, nulLen);
426 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
427 {
428 if ( outLen )
429 {
430 *outLen = dstLen;
431
432 if ( dstLen >= nulLen &&
433 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
434 {
435 // in this case the output is NUL-terminated and we're not
436 // supposed to count NUL
437 *outLen -= nulLen;
438 }
439 }
440
441 return buf;
442 }
443 }
444
445 if ( outLen )
446 *outLen = 0;
447
448 return wxCharBuffer();
449 }
450
451 // ----------------------------------------------------------------------------
452 // wxMBConvLibc
453 // ----------------------------------------------------------------------------
454
455 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
456 {
457 return wxMB2WC(buf, psz, n);
458 }
459
460 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
461 {
462 return wxWC2MB(buf, psz, n);
463 }
464
465 // ----------------------------------------------------------------------------
466 // wxConvBrokenFileNames
467 // ----------------------------------------------------------------------------
468
469 #ifdef __UNIX__
470
471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
472 {
473 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
474 wxStricmp(charset, _T("UTF8")) == 0 )
475 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
476 else
477 m_conv = new wxCSConv(charset);
478 }
479
480 #endif // __UNIX__
481
482 // ----------------------------------------------------------------------------
483 // UTF-7
484 // ----------------------------------------------------------------------------
485
486 // Implementation (C) 2004 Fredrik Roubert
487
488 //
489 // BASE64 decoding table
490 //
// indexed by the value of an input byte: gives the 6 bit value (0x00..0x3f) of
// the BASE64 digits 'A'..'Z', 'a'..'z', '0'..'9', '+' and '/' and 0xff for all
// the other bytes, which is how the decoder detects the end of a BASE64 run
491 static const unsigned char utf7unb64[] =
492 {
493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
499 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
500 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
502 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
503 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
504 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
505 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
506 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
507 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
508 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
525 };
526
// Decode the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL the decoded characters are stored in it and the decoding
// stops once n of them have been produced; if it is NULL the characters are
// only counted. A terminating NUL is stored if there is still room for it.
//
// Returns the number of wide characters produced (not counting the NUL) or
// wxCONV_FAILED if a '+' is followed by neither '-' nor enough BASE64 digits
// to make up at least one 16 bit unit.
527 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
528 {
529 size_t len = 0;
530
531 while ( *psz && (!buf || (len < n)) )
532 {
533 unsigned char cc = *psz++;
534 if (cc != '+')
535 {
536 // plain ASCII char
// (in fact any byte other than '+' is copied as is here)
537 if (buf)
538 *buf++ = cc;
539 len++;
540 }
541 else if (*psz == '-')
542 {
543 // encoded plus sign
544 if (buf)
545 *buf++ = cc;
546 len++;
547 psz++;
548 }
549 else // start of BASE64 encoded string
550 {
// d accumulates the decoded bits, 6 per BASE64 digit, and l is the
// number of bits in it not consumed yet; lsb tells whether the next
// byte extracted is the low (true) or the high (false) byte of the
// current 16 bit unit and ok becomes true once a full unit was decoded
//
// NOTE(review): n is only tested by the outer loop, so a long BASE64
// run looks able to store more than n characters in buf -- confirm
551 bool lsb, ok;
552 unsigned int d, l;
553 for ( ok = lsb = false, d = 0, l = 0;
554 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
555 psz++ )
556 {
557 d <<= 6;
558 d += cc;
559 for (l += 6; l >= 8; lsb = !lsb)
560 {
561 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
562 if (lsb)
563 {
// low byte: complete the unit started by the high byte
564 if (buf)
565 *buf++ |= c;
566 len ++;
567 ok = true;
568 }
569 else
570 {
// high byte: start a new unit without advancing buf yet
571 if (buf)
572 *buf = (wchar_t)(c << 8);
573 }
574 }
575 }
576
577 if ( !ok )
578 {
579 // in valid UTF7 we should have valid characters after '+'
580 return wxCONV_FAILED;
581 }
582
// the '-' terminating the BASE64 run is optional and not part of the
// output
583 if (*psz == '-')
584 psz++;
585 }
586 }
587
588 if ( buf && (len < n) )
589 *buf = '\0';
590
591 return len;
592 }
593
594 //
595 // BASE64 encoding table
596 //
// indexed by a 6 bit value (0..63): gives the BASE64 digit representing it
597 static const unsigned char utf7enb64[] =
598 {
599 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
600 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
601 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
602 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
603 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
604 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
605 'w', 'x', 'y', 'z', '0', '1', '2', '3',
606 '4', '5', '6', '7', '8', '9', '+', '/'
607 };
608
609 //
610 // UTF-7 encoding table
611 //
612 // 0 - Set D (directly encoded characters)
613 // 1 - Set O (optional direct characters)
614 // 2 - whitespace characters (optional)
615 // 3 - special characters
616 //
// indexed by the character code (0..127), 16 entries per row; notice that the
// encoder in this file only writes the characters of class 0 directly and
// BASE64-encodes all the others
617 static const unsigned char utf7encode[128] =
618 {
619 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
620 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
621 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
622 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
623 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
624 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
625 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
626 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
627 };
628
// Encode the NUL-terminated wide string psz as UTF-7.
//
// If buf is not NULL the output is stored in it and the encoding stops once n
// bytes have been produced; if it is NULL the bytes are only counted. A
// terminating NUL is stored if there is still room for it.
//
// Returns the number of bytes produced (not counting the NUL) or, when
// WC_UTF16 is not defined, wxCONV_FAILED if a character above 0xffff is found.
629 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
630 {
631 size_t len = 0;
632
633 while (*psz && ((!buf) || (len < n)))
634 {
635 wchar_t cc = *psz++;
636 if (cc < 0x80 && utf7encode[cc] < 1)
637 {
638 // plain ASCII char
639 if (buf)
640 *buf++ = (char)cc;
641
642 len++;
643 }
644 #ifndef WC_UTF16
645 else if (((wxUint32)cc) > 0xffff)
646 {
647 // no surrogate pair generation (yet?)
648 return wxCONV_FAILED;
649 }
650 #endif
651 else
652 {
// everything else is written as '+', BASE64 digits and '-', except for
// the plus sign itself which becomes just "+-"
//
// NOTE(review): n is only tested by the outer loop, so the bytes of
// one escaped run look able to go beyond n bytes in buf -- confirm
653 if (buf)
654 *buf++ = '+';
655
656 len++;
657 if (cc != '+')
658 {
659 // BASE64 encode string
// d accumulates the bits to encode and l is the number of bits in it
// not written out yet; each character contributes 16 bits, high byte
// first, and the run goes on until the end of the string or the next
// directly encodable character
660 unsigned int lsb, d, l;
661 for (d = 0, l = 0; /*nothing*/; psz++)
662 {
663 for (lsb = 0; lsb < 2; lsb ++)
664 {
665 d <<= 8;
666 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
667
668 for (l += 8; l >= 6; )
669 {
670 l -= 6;
671 if (buf)
672 *buf++ = utf7enb64[(d >> l) % 64];
673 len++;
674 }
675 }
676
677 cc = *psz;
678 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
679 break;
680 }
681
// write out the remaining bits, if any, padded with zero bits on the
// right to a full BASE64 digit
682 if (l != 0)
683 {
684 if (buf)
685 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
686
687 len++;
688 }
689 }
690
691 if (buf)
692 *buf++ = '-';
693 len++;
694 }
695 }
696
697 if (buf && (len < n))
698 *buf = 0;
699
700 return len;
701 }
702
703 // ----------------------------------------------------------------------------
704 // UTF-8
705 // ----------------------------------------------------------------------------
706
// utf8_max[i] is the greatest value which can be encoded using i + 1 bytes in
// (the original, up to 6 bytes long, form of) UTF-8; the last element is there
// to stop the searches for the encoding length of any 32 bit value
707 static const wxUint32 utf8_max[]=
708 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
709
710 // boundaries of the private use area we use to (temporarily) remap the
711 // bytes which are invalid in a UTF-8 encoded string: the byte b is mapped
// to wxUnicodePUA + b, wxUnicodePUAEnd is one past the last value used
712 const wxUint32 wxUnicodePUA = 0x100000;
713 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
714
715 // this table gives the length of the UTF-8 encoding from its first character:
// (it is indexed by the value of the lead byte; 0 means that this byte can't
// start a valid sequence)
716 const unsigned char tableUtf8Lengths[256] = {
717 // single-byte sequences (ASCII):
718 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
719 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
720 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
721 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
722 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
723 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
724 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
725 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
726
727 // these are invalid:
// (80..BF are continuation bytes and C0, C1 could only start overlong
// encodings of the ASCII characters)
728 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
731 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
732 0, 0, // C0,C1
733
734 // two-byte sequences:
735 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
736 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
737
738 // three-byte sequences:
739 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
740
741 // four-byte sequences:
742 4, 4, 4, 4, 4, // F0..F4
743
744 // these are invalid again (5- or 6-byte
745 // sequences and sequences for code points
746 // above U+10FFFF, as restricted by RFC 3629):
747 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
748 };
749
// Decode the UTF-8 string src, which is either srcLen bytes long or
// NUL-terminated if srcLen is wxNO_LEN (the terminating NUL is then converted
// as well).
//
// If dstLen is 0 nothing is written, even if dst is not NULL, and only the
// length of the output is computed.
//
// Returns the number of wide characters [which would be] written or
// wxCONV_FAILED if the input contains an invalid lead byte, a sequence
// truncated by the end of the input or an invalid continuation byte, or if
// the output buffer is too small.
750 size_t
751 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
752 const char *src, size_t srcLen) const
753 {
754 wchar_t *out = dstLen ? dst : NULL;
755 size_t written = 0;
756
757 if ( srcLen == wxNO_LEN )
758 srcLen = strlen(src) + 1;
759
// NOTE(review): as srcLen has just been replaced with the real length, the
// "srcLen == wxNO_LEN" tests in this loop don't look like they can ever be
// true: the trailing NUL is simply converted as the last input byte
760 for ( const char *p = src; ; p++ )
761 {
762 if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
763 {
764 // all done successfully, just add the trailing NULL if we are not
765 // using explicit length
766 if ( srcLen == wxNO_LEN )
767 {
768 if ( out )
769 {
770 if ( !dstLen )
771 break;
772
773 *out = L'\0';
774 }
775
776 written++;
777 }
778
779 return written;
780 }
781
// one unit of the output buffer is accounted for per decoded character
782 if ( out && !dstLen-- )
783 break;
784
785 wxUint32 code;
786 unsigned char c = *p;
787
788 if ( c < 0x80 )
789 {
790 if ( srcLen == 0 ) // the test works for wxNO_LEN too
791 break;
792
793 if ( srcLen != wxNO_LEN )
794 srcLen--;
795
796 code = c;
797 }
798 else
799 {
800 unsigned len = tableUtf8Lengths[c];
801 if ( !len )
802 break;
803
804 if ( srcLen < len ) // the test works for wxNO_LEN too
805 break;
806
807 if ( srcLen != wxNO_LEN )
808 srcLen -= len;
809
810 // Char. number range | UTF-8 octet sequence
811 // (hexadecimal) | (binary)
812 // ----------------------+----------------------------------------
813 // 0000 0000 - 0000 007F | 0xxxxxxx
814 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
815 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
816 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
817 //
818 // Code point value is stored in bits marked with 'x',
819 // lowest-order bit of the value on the right side in the diagram
820 // above. (from RFC 3629)
821
822 // mask to extract lead byte's value ('x' bits above), by sequence
823 // length:
824 static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
825
826 // mask and value of lead byte's most significant bits, by length:
827 static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
828 static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
829
830 len--; // it's more convenient to work with 0-based length here
831
832 // extract the lead byte's value bits:
833 if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
834 break;
835
836 code = c & leadValueMask[len];
837
838 // all remaining bytes, if any, are handled in the same way
839 // regardless of sequence's length:
// NOTE(review): only the 10xxxxxx form of the continuation bytes is
// checked here, so overlong 3 and 4 byte forms, encoded surrogates and
// values above 0x10FFFF (lead byte F4) look to be accepted -- confirm
840 for ( ; len; --len )
841 {
842 c = *++p;
843 if ( (c & 0xC0) != 0x80 )
844 return wxCONV_FAILED;
845
846 code <<= 6;
847 code |= c & 0x3F;
848 }
849 }
850
851 #ifdef WC_UTF16
852 // cast is ok because wchar_t == wxUint16 if WC_UTF16
// NOTE(review): a surrogate pair takes two units of the output but dstLen
// was decremented only once above, so the second unit looks able to be
// written past the end of a buffer with just one free unit -- confirm
853 if ( encode_utf16(code, (wxUint16 *)out) == 2 )
854 {
855 if ( out )
856 out++;
857 written++;
858 }
859 #else // !WC_UTF16
860 if ( out )
861 *out = code;
862 #endif // WC_UTF16/!WC_UTF16
863
864 if ( out )
865 out++;
866
867 written++;
868 }
869
// we only get here by breaking out of the loop because of an error
870 return wxCONV_FAILED;
871 }
872
873 size_t
874 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
875 const wchar_t *src, size_t srcLen) const
876 {
877 char *out = dstLen ? dst : NULL;
878 size_t written = 0;
879
880 for ( const wchar_t *wp = src; ; wp++ )
881 {
882 if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
883 {
884 // all done successfully, just add the trailing NULL if we are not
885 // using explicit length
886 if ( srcLen == wxNO_LEN )
887 {
888 if ( out )
889 {
890 if ( !dstLen )
891 break;
892
893 *out = '\0';
894 }
895
896 written++;
897 }
898
899 return written;
900 }
901
902
903 wxUint32 code;
904 #ifdef WC_UTF16
905 // cast is ok for WC_UTF16
906 if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
907 {
908 // skip the next char too as we decoded a surrogate
909 wp++;
910 }
911 #else // wchar_t is UTF-32
912 code = *wp & 0x7fffffff;
913 #endif
914
915 unsigned len;
916 if ( code <= 0x7F )
917 {
918 len = 1;
919 if ( out )
920 {
921 if ( dstLen < len )
922 break;
923
924 out[0] = (char)code;
925 }
926 }
927 else if ( code <= 0x07FF )
928 {
929 len = 2;
930 if ( out )
931 {
932 if ( dstLen < len )
933 break;
934
935 // NB: this line takes 6 least significant bits, encodes them as
936 // 10xxxxxx and discards them so that the next byte can be encoded:
937 out[1] = 0x80 | (code & 0x3F); code >>= 6;
938 out[0] = 0xC0 | code;
939 }
940 }
941 else if ( code < 0xFFFF )
942 {
943 len = 3;
944 if ( out )
945 {
946 if ( dstLen < len )
947 break;
948
949 out[2] = 0x80 | (code & 0x3F); code >>= 6;
950 out[1] = 0x80 | (code & 0x3F); code >>= 6;
951 out[0] = 0xE0 | code;
952 }
953 }
954 else if ( code <= 0x10FFFF )
955 {
956 len = 4;
957 if ( out )
958 {
959 if ( dstLen < len )
960 break;
961
962 out[3] = 0x80 | (code & 0x3F); code >>= 6;
963 out[2] = 0x80 | (code & 0x3F); code >>= 6;
964 out[1] = 0x80 | (code & 0x3F); code >>= 6;
965 out[0] = 0xF0 | code;
966 }
967 }
968 else
969 {
970 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
971 break;
972 }
973
974 if ( out )
975 {
976 out += len;
977 dstLen -= len;
978 }
979
980 written += len;
981 }
982
983 // we only get here if an error occurs during decoding
984 return wxCONV_FAILED;
985 }
986
// Decode the UTF-8 string psz, which is either srcLen bytes long or
// NUL-terminated if srcLen is wxNO_LEN.
//
// If m_options is MAP_INVALID_UTF8_NOT the strict decoder of the base class is
// used. Otherwise the invalid input bytes don't result in an error but are
// mapped either to the private use area characters (MAP_INVALID_UTF8_TO_PUA)
// or to backslash-escaped octal numbers (MAP_INVALID_UTF8_TO_OCTAL).
//
// If buf is NULL the output is only counted, otherwise at most about n wide
// characters are produced. Returns the number of characters produced plus 1.
987 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
988 const char *psz, size_t srcLen) const
989 {
990 if ( m_options == MAP_INVALID_UTF8_NOT )
991 return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
992
993 size_t len = 0;
994
// NOTE(review): with an explicit length, only one "srcLen--" is done per
// decoded character, even for the multi-byte ones, and the decrement done
// when srcLen is already 0 makes it equal to size_t(-1), which is presumably
// what wxNO_LEN is -- confirm that this is really intended
995 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
996 {
997 const char *opsz = psz;
998 bool invalid = false;
999 unsigned char cc = *psz++, fc = cc;
// count the leading 1 bits of the first byte
1000 unsigned cnt;
1001 for (cnt = 0; fc & 0x80; cnt++)
1002 fc <<= 1;
1003
1004 if (!cnt)
1005 {
1006 // plain ASCII char
1007 if (buf)
1008 *buf++ = cc;
1009 len++;
1010
1011 // escape the escape character for octal escapes
1012 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1013 && cc == '\\' && (!buf || len < n))
1014 {
1015 if (buf)
1016 *buf++ = cc;
1017 len++;
1018 }
1019 }
1020 else
1021 {
// cnt is now the number of continuation bytes expected
1022 cnt--;
1023 if (!cnt)
1024 {
1025 // invalid UTF-8 sequence
// (a continuation byte in the lead byte position)
1026 invalid = true;
1027 }
1028 else
1029 {
1030 unsigned ocnt = cnt - 1;
1031 wxUint32 res = cc & (0x3f >> cnt);
1032 while (cnt--)
1033 {
1034 cc = *psz;
1035 if ((cc & 0xC0) != 0x80)
1036 {
1037 // invalid UTF-8 sequence
1038 invalid = true;
1039 break;
1040 }
1041
1042 psz++;
1043 res = (res << 6) | (cc & 0x3f);
1044 }
1045
// the value must be greater than the maximum which can be encoded by a
// sequence one byte shorter, otherwise this is an overlong encoding
1046 if (invalid || res <= utf8_max[ocnt])
1047 {
1048 // illegal UTF-8 encoding
1049 invalid = true;
1050 }
1051 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1052 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1053 {
1054 // if one of our PUA characters turns up externally
1055 // it must also be treated as an illegal sequence
1056 // (a bit like you have to escape an escape character)
1057 invalid = true;
1058 }
1059 else
1060 {
1061 #ifdef WC_UTF16
1062 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1063 size_t pa = encode_utf16(res, (wxUint16 *)buf);
1064 if (pa == wxCONV_FAILED)
1065 {
1066 invalid = true;
1067 }
1068 else
1069 {
1070 if (buf)
1071 buf += pa;
1072 len += pa;
1073 }
1074 #else // !WC_UTF16
1075 if (buf)
1076 *buf++ = (wchar_t)res;
1077 len++;
1078 #endif // WC_UTF16/!WC_UTF16
1079 }
1080 }
1081
// map each of the bytes consumed so far (opsz..psz) individually
1082 if (invalid)
1083 {
1084 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1085 {
1086 while (opsz < psz && (!buf || len < n))
1087 {
1088 #ifdef WC_UTF16
1089 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1090 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1091 wxASSERT(pa != wxCONV_FAILED);
1092 if (buf)
1093 buf += pa;
1094 opsz++;
1095 len += pa;
1096 #else
1097 if (buf)
1098 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1099 opsz++;
1100 len++;
1101 #endif
1102 }
1103 }
1104 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1105 {
1106 while (opsz < psz && (!buf || len < n))
1107 {
// the 4 characters of the escape are only stored if all of them fit
// but are counted in any case
1108 if ( buf && len + 3 < n )
1109 {
1110 unsigned char on = *opsz;
1111 *buf++ = L'\\';
1112 *buf++ = (wchar_t)( L'0' + on / 0100 );
1113 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1114 *buf++ = (wchar_t)( L'0' + on % 010 );
1115 }
1116
1117 opsz++;
1118 len += 4;
1119 }
1120 }
1121 else // MAP_INVALID_UTF8_NOT
1122 {
1123 return wxCONV_FAILED;
1124 }
1125 }
1126 }
1127 }
1128
1129 if (srcLen == wxNO_LEN && buf && (len < n))
1130 *buf = 0;
1131
// NOTE(review): 1 is added for the trailing NUL unconditionally here, i.e.
// also when an explicit length was given and/or no NUL was stored -- confirm
1132 return len + 1;
1133 }
1134
// return true if the given character is an octal digit, i.e. one of '0'..'7'
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1139
// Encode the wide string psz, which is either srcLen characters long or
// NUL-terminated if srcLen is wxNO_LEN, as UTF-8.
//
// If m_options is MAP_INVALID_UTF8_NOT the strict encoder of the base class is
// used. Otherwise the mappings done when decoding are undone: the private use
// area characters (MAP_INVALID_UTF8_TO_PUA) or the octal escapes
// (MAP_INVALID_UTF8_TO_OCTAL) are converted back to the original bytes.
//
// If buf is NULL the output is only counted, otherwise the encoding stops once
// n bytes have been produced. Returns the number of bytes produced plus 1.
1140 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1141 const wchar_t *psz, size_t srcLen) const
1142 {
1143 if ( m_options == MAP_INVALID_UTF8_NOT )
1144 return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1145
1146 size_t len = 0;
1147
// NOTE(review): with an explicit length, only one "srcLen--" is done per
// iteration although some of them consume several input characters (surrogate
// pairs, escapes), and the decrement done when srcLen is already 0 makes it
// equal to size_t(-1), which is presumably what wxNO_LEN is -- confirm
1148 while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1149 {
1150 wxUint32 cc;
1151
1152 #ifdef WC_UTF16
1153 // cast is ok for WC_UTF16
// a unit which can't be decoded is skipped on its own, decode_utf16()
// stores its value in cc in this case
1154 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1155 psz += (pa == wxCONV_FAILED) ? 1 : pa;
1156 #else
1157 cc = (*psz++) & 0x7fffffff;
1158 #endif
1159
1160 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1161 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1162 {
// this is one of the characters used to represent the invalid bytes:
// restore the original byte
1163 if (buf)
1164 *buf++ = (char)(cc - wxUnicodePUA);
1165 len++;
1166 }
1167 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1168 && cc == L'\\' && psz[0] == L'\\' )
1169 {
// an escaped backslash: write just one of them
1170 if (buf)
1171 *buf++ = (char)cc;
1172 psz++;
1173 len++;
1174 }
1175 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1176 cc == L'\\' &&
1177 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1178 {
// a backslash followed by 3 octal digits: restore the byte they stand for
1179 if (buf)
1180 {
1181 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1182 (psz[1] - L'0') * 010 +
1183 (psz[2] - L'0'));
1184 }
1185
1186 psz += 3;
1187 len++;
1188 }
1189 else
1190 {
// normal character: cnt is the number of the continuation bytes needed
// to encode it
1191 unsigned cnt;
1192 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1193 {
1194 }
1195
1196 if (!cnt)
1197 {
1198 // plain ASCII char
1199 if (buf)
1200 *buf++ = (char) cc;
1201 len++;
1202 }
1203 else
1204 {
1205 len += cnt + 1;
1206 if (buf)
1207 {
// the lead byte: cnt + 1 most significant bits set followed by the most
// significant bits of the value, then the continuation bytes containing
// 6 bits of the value each
// NOTE(review): n is not tested before storing these bytes -- confirm
1208 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1209 while (cnt--)
1210 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1211 }
1212 }
1213 }
1214 }
1215
1216 if (srcLen == wxNO_LEN && buf && (len < n))
1217 *buf = 0;
1218
// NOTE(review): 1 is added for the trailing NUL unconditionally here, i.e.
// also when an explicit length was given and/or no NUL was stored -- confirm
1219 return len + 1;
1220 }
1221
1222 // ============================================================================
1223 // UTF-16
1224 // ============================================================================
1225
// the conversions below are implemented as the "straight" one, which doesn't
// change the byte order of the data, and the "swap" one, which reverses it:
// map these names to the big and little endian converter classes depending on
// whether WORDS_BIGENDIAN is defined
1226 #ifdef WORDS_BIGENDIAN
1227 #define wxMBConvUTF16straight wxMBConvUTF16BE
1228 #define wxMBConvUTF16swap wxMBConvUTF16LE
1229 #else
1230 #define wxMBConvUTF16swap wxMBConvUTF16BE
1231 #define wxMBConvUTF16straight wxMBConvUTF16LE
1232 #endif
1233
1234 /* static */
1235 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1236 {
1237 if ( srcLen == wxNO_LEN )
1238 {
1239 // count the number of bytes in input, including the trailing NULs
1240 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1241 for ( srcLen = 1; *inBuff++; srcLen++ )
1242 ;
1243
1244 srcLen *= BYTES_PER_CHAR;
1245 }
1246 else // we already have the length
1247 {
1248 // we can only convert an entire number of UTF-16 characters
1249 if ( srcLen % BYTES_PER_CHAR )
1250 return wxCONV_FAILED;
1251 }
1252
1253 return srcLen;
1254 }
1255
1256 // case when in-memory representation is UTF-16 too
1257 #ifdef WC_UTF16
1258
1259 // ----------------------------------------------------------------------------
1260 // conversions without endianness change
1261 // ----------------------------------------------------------------------------
1262
1263 size_t
1264 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1265 const char *src, size_t srcLen) const
1266 {
1267 // set up the scene for using memcpy() (which is presumably more efficient
1268 // than copying the bytes one by one)
1269 srcLen = GetLength(src, srcLen);
1270 if ( srcLen == wxNO_LEN )
1271 return wxCONV_FAILED;
1272
1273 const size_t inLen = srcLen / BYTES_PER_CHAR;
1274 if ( dst )
1275 {
1276 if ( dstLen < inLen )
1277 return wxCONV_FAILED;
1278
1279 memcpy(dst, src, srcLen);
1280 }
1281
1282 return inLen;
1283 }
1284
1285 size_t
1286 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1287 const wchar_t *src, size_t srcLen) const
1288 {
1289 if ( srcLen == wxNO_LEN )
1290 srcLen = wxWcslen(src) + 1;
1291
1292 srcLen *= BYTES_PER_CHAR;
1293
1294 if ( dst )
1295 {
1296 if ( dstLen < srcLen )
1297 return wxCONV_FAILED;
1298
1299 memcpy(dst, src, srcLen);
1300 }
1301
1302 return srcLen;
1303 }
1304
1305 // ----------------------------------------------------------------------------
1306 // endian-reversing conversions
1307 // ----------------------------------------------------------------------------
1308
1309 size_t
1310 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1311 const char *src, size_t srcLen) const
1312 {
1313 srcLen = GetLength(src, srcLen);
1314 if ( srcLen == wxNO_LEN )
1315 return wxCONV_FAILED;
1316
1317 srcLen /= BYTES_PER_CHAR;
1318
1319 if ( dst )
1320 {
1321 if ( dstLen < srcLen )
1322 return wxCONV_FAILED;
1323
1324 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1325 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1326 {
1327 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1328 }
1329 }
1330
1331 return srcLen;
1332 }
1333
1334 size_t
1335 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1336 const wchar_t *src, size_t srcLen) const
1337 {
1338 if ( srcLen == wxNO_LEN )
1339 srcLen = wxWcslen(src) + 1;
1340
1341 srcLen *= BYTES_PER_CHAR;
1342
1343 if ( dst )
1344 {
1345 if ( dstLen < srcLen )
1346 return wxCONV_FAILED;
1347
1348 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1349 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1350 {
1351 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1352 }
1353 }
1354
1355 return srcLen;
1356 }
1357
1358 #else // !WC_UTF16: wchar_t is UTF-32
1359
1360 // ----------------------------------------------------------------------------
1361 // conversions without endianness change
1362 // ----------------------------------------------------------------------------
1363
1364 size_t
1365 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1366 const char *src, size_t srcLen) const
1367 {
1368 srcLen = GetLength(src, srcLen);
1369 if ( srcLen == wxNO_LEN )
1370 return wxCONV_FAILED;
1371
1372 const size_t inLen = srcLen / BYTES_PER_CHAR;
1373 if ( !dst )
1374 {
1375 // optimization: return maximal space which could be needed for this
1376 // string even if the real size could be smaller if the buffer contains
1377 // any surrogates
1378 return inLen;
1379 }
1380
1381 size_t outLen = 0;
1382 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1383 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1384 {
1385 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1386 if ( !inBuff )
1387 return wxCONV_FAILED;
1388
1389 if ( ++outLen > dstLen )
1390 return wxCONV_FAILED;
1391
1392 *dst++ = ch;
1393 }
1394
1395
1396 return outLen;
1397 }
1398
1399 size_t
1400 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1401 const wchar_t *src, size_t srcLen) const
1402 {
1403 if ( srcLen == wxNO_LEN )
1404 srcLen = wxWcslen(src) + 1;
1405
1406 size_t outLen = 0;
1407 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1408 for ( size_t n = 0; n < srcLen; n++ )
1409 {
1410 wxUint16 cc[2];
1411 const size_t numChars = encode_utf16(*src++, cc);
1412 if ( numChars == wxCONV_FAILED )
1413 return wxCONV_FAILED;
1414
1415 outLen += numChars * BYTES_PER_CHAR;
1416 if ( outBuff )
1417 {
1418 if ( outLen > dstLen )
1419 return wxCONV_FAILED;
1420
1421 *outBuff++ = cc[0];
1422 if ( numChars == 2 )
1423 {
1424 // second character of a surrogate
1425 *outBuff++ = cc[1];
1426 }
1427 }
1428 }
1429
1430 return outLen;
1431 }
1432
1433 // ----------------------------------------------------------------------------
1434 // endian-reversing conversions
1435 // ----------------------------------------------------------------------------
1436
1437 size_t
1438 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1439 const char *src, size_t srcLen) const
1440 {
1441 srcLen = GetLength(src, srcLen);
1442 if ( srcLen == wxNO_LEN )
1443 return wxCONV_FAILED;
1444
1445 const size_t inLen = srcLen / BYTES_PER_CHAR;
1446 if ( !dst )
1447 {
1448 // optimization: return maximal space which could be needed for this
1449 // string even if the real size could be smaller if the buffer contains
1450 // any surrogates
1451 return inLen;
1452 }
1453
1454 size_t outLen = 0;
1455 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1456 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1457 {
1458 wxUint32 ch;
1459 wxUint16 tmp[2];
1460
1461 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1462 inBuff++;
1463 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1464
1465 const size_t numChars = decode_utf16(tmp, ch);
1466 if ( numChars == wxCONV_FAILED )
1467 return wxCONV_FAILED;
1468
1469 if ( numChars == 2 )
1470 inBuff++;
1471
1472 if ( ++outLen > dstLen )
1473 return wxCONV_FAILED;
1474
1475 *dst++ = ch;
1476 }
1477
1478
1479 return outLen;
1480 }
1481
1482 size_t
1483 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1484 const wchar_t *src, size_t srcLen) const
1485 {
1486 if ( srcLen == wxNO_LEN )
1487 srcLen = wxWcslen(src) + 1;
1488
1489 size_t outLen = 0;
1490 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1491 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1492 {
1493 wxUint16 cc[2];
1494 const size_t numChars = encode_utf16(*src, cc);
1495 if ( numChars == wxCONV_FAILED )
1496 return wxCONV_FAILED;
1497
1498 outLen += numChars * BYTES_PER_CHAR;
1499 if ( outBuff )
1500 {
1501 if ( outLen > dstLen )
1502 return wxCONV_FAILED;
1503
1504 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1505 if ( numChars == 2 )
1506 {
1507 // second character of a surrogate
1508 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1509 }
1510 }
1511 }
1512
1513 return outLen;
1514 }
1515
1516 #endif // WC_UTF16/!WC_UTF16
1517
1518
1519 // ============================================================================
1520 // UTF-32
1521 // ============================================================================
1522
1523 #ifdef WORDS_BIGENDIAN
1524 #define wxMBConvUTF32straight wxMBConvUTF32BE
1525 #define wxMBConvUTF32swap wxMBConvUTF32LE
1526 #else
1527 #define wxMBConvUTF32swap wxMBConvUTF32BE
1528 #define wxMBConvUTF32straight wxMBConvUTF32LE
1529 #endif
1530
1531
1532 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1533 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1534
1535 /* static */
1536 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1537 {
1538 if ( srcLen == wxNO_LEN )
1539 {
1540 // count the number of bytes in input, including the trailing NULs
1541 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1542 for ( srcLen = 1; *inBuff++; srcLen++ )
1543 ;
1544
1545 srcLen *= BYTES_PER_CHAR;
1546 }
1547 else // we already have the length
1548 {
1549 // we can only convert an entire number of UTF-32 characters
1550 if ( srcLen % BYTES_PER_CHAR )
1551 return wxCONV_FAILED;
1552 }
1553
1554 return srcLen;
1555 }
1556
1557 // case when in-memory representation is UTF-16
1558 #ifdef WC_UTF16
1559
1560 // ----------------------------------------------------------------------------
1561 // conversions without endianness change
1562 // ----------------------------------------------------------------------------
1563
1564 size_t
1565 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1566 const char *src, size_t srcLen) const
1567 {
1568 srcLen = GetLength(src, srcLen);
1569 if ( srcLen == wxNO_LEN )
1570 return wxCONV_FAILED;
1571
1572 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1573 const size_t inLen = srcLen / BYTES_PER_CHAR;
1574 size_t outLen = 0;
1575 for ( size_t n = 0; n < inLen; n++ )
1576 {
1577 wxUint16 cc[2];
1578 const size_t numChars = encode_utf16(*inBuff++, cc);
1579 if ( numChars == wxCONV_FAILED )
1580 return wxCONV_FAILED;
1581
1582 outLen += numChars;
1583 if ( dst )
1584 {
1585 if ( outLen > dstLen )
1586 return wxCONV_FAILED;
1587
1588 *dst++ = cc[0];
1589 if ( numChars == 2 )
1590 {
1591 // second character of a surrogate
1592 *dst++ = cc[1];
1593 }
1594 }
1595 }
1596
1597 return outLen;
1598 }
1599
1600 size_t
1601 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1602 const wchar_t *src, size_t srcLen) const
1603 {
1604 if ( srcLen == wxNO_LEN )
1605 srcLen = wxWcslen(src) + 1;
1606
1607 if ( !dst )
1608 {
1609 // optimization: return maximal space which could be needed for this
1610 // string instead of the exact amount which could be less if there are
1611 // any surrogates in the input
1612 //
1613 // we consider that surrogates are rare enough to make it worthwhile to
1614 // avoid running the loop below at the cost of slightly extra memory
1615 // consumption
1616 return srcLen * BYTES_PER_CHAR;
1617 }
1618
1619 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1620 size_t outLen = 0;
1621 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1622 {
1623 const wxUint32 ch = wxDecodeSurrogate(&src);
1624 if ( !src )
1625 return wxCONV_FAILED;
1626
1627 outLen += BYTES_PER_CHAR;
1628
1629 if ( outLen > dstLen )
1630 return wxCONV_FAILED;
1631
1632 *outBuff++ = ch;
1633 }
1634
1635 return outLen;
1636 }
1637
1638 // ----------------------------------------------------------------------------
1639 // endian-reversing conversions
1640 // ----------------------------------------------------------------------------
1641
1642 size_t
1643 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1644 const char *src, size_t srcLen) const
1645 {
1646 srcLen = GetLength(src, srcLen);
1647 if ( srcLen == wxNO_LEN )
1648 return wxCONV_FAILED;
1649
1650 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1651 const size_t inLen = srcLen / BYTES_PER_CHAR;
1652 size_t outLen = 0;
1653 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1654 {
1655 wxUint16 cc[2];
1656 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1657 if ( numChars == wxCONV_FAILED )
1658 return wxCONV_FAILED;
1659
1660 outLen += numChars;
1661 if ( dst )
1662 {
1663 if ( outLen > dstLen )
1664 return wxCONV_FAILED;
1665
1666 *dst++ = cc[0];
1667 if ( numChars == 2 )
1668 {
1669 // second character of a surrogate
1670 *dst++ = cc[1];
1671 }
1672 }
1673 }
1674
1675 return outLen;
1676 }
1677
1678 size_t
1679 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1680 const wchar_t *src, size_t srcLen) const
1681 {
1682 if ( srcLen == wxNO_LEN )
1683 srcLen = wxWcslen(src) + 1;
1684
1685 if ( !dst )
1686 {
1687 // optimization: return maximal space which could be needed for this
1688 // string instead of the exact amount which could be less if there are
1689 // any surrogates in the input
1690 //
1691 // we consider that surrogates are rare enough to make it worthwhile to
1692 // avoid running the loop below at the cost of slightly extra memory
1693 // consumption
1694 return srcLen*BYTES_PER_CHAR;
1695 }
1696
1697 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1698 size_t outLen = 0;
1699 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1700 {
1701 const wxUint32 ch = wxDecodeSurrogate(&src);
1702 if ( !src )
1703 return wxCONV_FAILED;
1704
1705 outLen += BYTES_PER_CHAR;
1706
1707 if ( outLen > dstLen )
1708 return wxCONV_FAILED;
1709
1710 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1711 }
1712
1713 return outLen;
1714 }
1715
1716 #else // !WC_UTF16: wchar_t is UTF-32
1717
1718 // ----------------------------------------------------------------------------
1719 // conversions without endianness change
1720 // ----------------------------------------------------------------------------
1721
1722 size_t
1723 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1724 const char *src, size_t srcLen) const
1725 {
1726 // use memcpy() as it should be much faster than hand-written loop
1727 srcLen = GetLength(src, srcLen);
1728 if ( srcLen == wxNO_LEN )
1729 return wxCONV_FAILED;
1730
1731 const size_t inLen = srcLen/BYTES_PER_CHAR;
1732 if ( dst )
1733 {
1734 if ( dstLen < inLen )
1735 return wxCONV_FAILED;
1736
1737 memcpy(dst, src, srcLen);
1738 }
1739
1740 return inLen;
1741 }
1742
1743 size_t
1744 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1745 const wchar_t *src, size_t srcLen) const
1746 {
1747 if ( srcLen == wxNO_LEN )
1748 srcLen = wxWcslen(src) + 1;
1749
1750 srcLen *= BYTES_PER_CHAR;
1751
1752 if ( dst )
1753 {
1754 if ( dstLen < srcLen )
1755 return wxCONV_FAILED;
1756
1757 memcpy(dst, src, srcLen);
1758 }
1759
1760 return srcLen;
1761 }
1762
1763 // ----------------------------------------------------------------------------
1764 // endian-reversing conversions
1765 // ----------------------------------------------------------------------------
1766
1767 size_t
1768 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1769 const char *src, size_t srcLen) const
1770 {
1771 srcLen = GetLength(src, srcLen);
1772 if ( srcLen == wxNO_LEN )
1773 return wxCONV_FAILED;
1774
1775 srcLen /= BYTES_PER_CHAR;
1776
1777 if ( dst )
1778 {
1779 if ( dstLen < srcLen )
1780 return wxCONV_FAILED;
1781
1782 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1783 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1784 {
1785 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1786 }
1787 }
1788
1789 return srcLen;
1790 }
1791
1792 size_t
1793 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1794 const wchar_t *src, size_t srcLen) const
1795 {
1796 if ( srcLen == wxNO_LEN )
1797 srcLen = wxWcslen(src) + 1;
1798
1799 srcLen *= BYTES_PER_CHAR;
1800
1801 if ( dst )
1802 {
1803 if ( dstLen < srcLen )
1804 return wxCONV_FAILED;
1805
1806 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1807 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1808 {
1809 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1810 }
1811 }
1812
1813 return srcLen;
1814 }
1815
1816 #endif // WC_UTF16/!WC_UTF16
1817
1818
1819 // ============================================================================
1820 // The classes doing conversion using the iconv_xxx() functions
1821 // ============================================================================
1822
1823 #ifdef HAVE_ICONV
1824
1825 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1826 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1827 // (unless there's yet another bug in glibc) the only case when iconv()
1828 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1829 // left in the input buffer -- when _real_ error occurs,
1830 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1831 // iconv() failure.
1832 // [This bug does not appear in glibc 2.2.]
1833 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1834 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1835 (errno != E2BIG || bufLeft != 0))
1836 #else
1837 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1838 #endif
1839
1840 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1841
1842 #define ICONV_T_INVALID ((iconv_t)-1)
1843
1844 #if SIZEOF_WCHAR_T == 4
1845 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1846 #define WC_ENC wxFONTENCODING_UTF32
1847 #elif SIZEOF_WCHAR_T == 2
1848 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1849 #define WC_ENC wxFONTENCODING_UTF16
1850 #else // sizeof(wchar_t) != 2 nor 4
1851 // does this ever happen?
1852 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1853 #endif
1854
1855 // ----------------------------------------------------------------------------
1856 // wxMBConv_iconv: encapsulates an iconv character set
1857 // ----------------------------------------------------------------------------
1858
1859 class wxMBConv_iconv : public wxMBConv
1860 {
1861 public:
1862 wxMBConv_iconv(const char *name);
1863 virtual ~wxMBConv_iconv();
1864
1865 // implement base class virtual methods
1866 virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1867 const char *src, size_t srcLen = wxNO_LEN) const;
1868 virtual size_t FromWChar(char *dst, size_t dstLen,
1869 const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1870 virtual size_t GetMBNulLen() const;
1871
1872 #if wxUSE_UNICODE_UTF8
1873 virtual bool IsUTF8() const;
1874 #endif
1875
1876 virtual wxMBConv *Clone() const
1877 {
1878 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1879 p->m_minMBCharWidth = m_minMBCharWidth;
1880 return p;
1881 }
1882
1883 bool IsOk() const
1884 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1885
1886 protected:
1887 // the iconv handlers used to translate from multibyte
1888 // to wide char and in the other direction
1889 iconv_t m2w,
1890 w2m;
1891
1892 #if wxUSE_THREADS
1893 // guards access to m2w and w2m objects
1894 wxMutex m_iconvMutex;
1895 #endif
1896
1897 private:
1898 // the name (for iconv_open()) of a wide char charset -- if none is
1899 // available on this machine, it will remain NULL
1900 static wxString ms_wcCharsetName;
1901
1902 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1903 // different endian-ness than the native one
1904 static bool ms_wcNeedsSwap;
1905
1906
1907 // name of the encoding handled by this conversion
1908 wxString m_name;
1909
1910 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1911 // initially
1912 size_t m_minMBCharWidth;
1913 };
1914
1915 // make the constructor available for unit testing
1916 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1917 {
1918 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1919 if ( !result->IsOk() )
1920 {
1921 delete result;
1922 return 0;
1923 }
1924
1925 return result;
1926 }
1927
1928 wxString wxMBConv_iconv::ms_wcCharsetName;
1929 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1930
1931 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1932 : m_name(name)
1933 {
1934 m_minMBCharWidth = 0;
1935
1936 // check for charset that represents wchar_t:
1937 if ( ms_wcCharsetName.empty() )
1938 {
1939 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1940
1941 #if wxUSE_FONTMAP
1942 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1943 #else // !wxUSE_FONTMAP
1944 static const wxChar *names_static[] =
1945 {
1946 #if SIZEOF_WCHAR_T == 4
1947 _T("UCS-4"),
1948 #elif SIZEOF_WCHAR_T = 2
1949 _T("UCS-2"),
1950 #endif
1951 NULL
1952 };
1953 const wxChar **names = names_static;
1954 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1955
1956 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1957 {
1958 const wxString nameCS(*names);
1959
1960 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1961 wxString nameXE(nameCS);
1962
1963 #ifdef WORDS_BIGENDIAN
1964 nameXE += _T("BE");
1965 #else // little endian
1966 nameXE += _T("LE");
1967 #endif
1968
1969 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1970 nameXE.c_str());
1971
1972 m2w = iconv_open(nameXE.ToAscii(), name);
1973 if ( m2w == ICONV_T_INVALID )
1974 {
1975 // try charset w/o bytesex info (e.g. "UCS4")
1976 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1977 nameCS.c_str());
1978 m2w = iconv_open(nameCS.ToAscii(), name);
1979
1980 // and check for bytesex ourselves:
1981 if ( m2w != ICONV_T_INVALID )
1982 {
1983 char buf[2], *bufPtr;
1984 wchar_t wbuf[2];
1985 size_t insz, outsz;
1986 size_t res;
1987
1988 buf[0] = 'A';
1989 buf[1] = 0;
1990 wbuf[0] = 0;
1991 insz = 2;
1992 outsz = SIZEOF_WCHAR_T * 2;
1993 char* wbufPtr = (char*)wbuf;
1994 bufPtr = buf;
1995
1996 res = iconv(
1997 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1998 &wbufPtr, &outsz);
1999
2000 if (ICONV_FAILED(res, insz))
2001 {
2002 wxLogLastError(wxT("iconv"));
2003 wxLogError(_("Conversion to charset '%s' doesn't work."),
2004 nameCS.c_str());
2005 }
2006 else // ok, can convert to this encoding, remember it
2007 {
2008 ms_wcCharsetName = nameCS;
2009 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2010 }
2011 }
2012 }
2013 else // use charset not requiring byte swapping
2014 {
2015 ms_wcCharsetName = nameXE;
2016 }
2017 }
2018
2019 wxLogTrace(TRACE_STRCONV,
2020 wxT("iconv wchar_t charset is \"%s\"%s"),
2021 ms_wcCharsetName.empty() ? wxString("<none>")
2022 : ms_wcCharsetName,
2023 ms_wcNeedsSwap ? _T(" (needs swap)")
2024 : _T(""));
2025 }
2026 else // we already have ms_wcCharsetName
2027 {
2028 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2029 }
2030
2031 if ( ms_wcCharsetName.empty() )
2032 {
2033 w2m = ICONV_T_INVALID;
2034 }
2035 else
2036 {
2037 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2038 if ( w2m == ICONV_T_INVALID )
2039 {
2040 wxLogTrace(TRACE_STRCONV,
2041 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2042 ms_wcCharsetName.c_str(), name);
2043 }
2044 }
2045 }
2046
2047 wxMBConv_iconv::~wxMBConv_iconv()
2048 {
2049 if ( m2w != ICONV_T_INVALID )
2050 iconv_close(m2w);
2051 if ( w2m != ICONV_T_INVALID )
2052 iconv_close(w2m);
2053 }
2054
2055 size_t
2056 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2057 const char *src, size_t srcLen) const
2058 {
2059 if ( srcLen == wxNO_LEN )
2060 {
2061 // find the string length: notice that must be done differently for
2062 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2063 // consecutive NULs
2064 const size_t nulLen = GetMBNulLen();
2065 switch ( nulLen )
2066 {
2067 default:
2068 return wxCONV_FAILED;
2069
2070 case 1:
2071 srcLen = strlen(src); // arguably more optimized than our version
2072 break;
2073
2074 case 2:
2075 case 4:
2076 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2077 // but they also have to start at character boundary and not
2078 // span two adjacent characters
2079 const char *p;
2080 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2081 ;
2082 srcLen = p - src;
2083 break;
2084 }
2085
2086 // when we're determining the length of the string ourselves we count
2087 // the terminating NUL(s) as part of it and always NUL-terminate the
2088 // output
2089 srcLen += nulLen;
2090 }
2091
2092 // we express length in the number of (wide) characters but iconv always
2093 // counts buffer sizes it in bytes
2094 dstLen *= SIZEOF_WCHAR_T;
2095
2096 #if wxUSE_THREADS
2097 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2098 // Unfortunately there are a couple of global wxCSConv objects such as
2099 // wxConvLocal that are used all over wx code, so we have to make sure
2100 // the handle is used by at most one thread at the time. Otherwise
2101 // only a few wx classes would be safe to use from non-main threads
2102 // as MB<->WC conversion would fail "randomly".
2103 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2104 #endif // wxUSE_THREADS
2105
2106 size_t res, cres;
2107 const char *pszPtr = src;
2108
2109 if ( dst )
2110 {
2111 char* bufPtr = (char*)dst;
2112
2113 // have destination buffer, convert there
2114 cres = iconv(m2w,
2115 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2116 &bufPtr, &dstLen);
2117 res = dstLen - (dstLen / SIZEOF_WCHAR_T);
2118
2119 if (ms_wcNeedsSwap)
2120 {
2121 // convert to native endianness
2122 for ( unsigned i = 0; i < res; i++ )
2123 dst[i] = WC_BSWAP(dst[i]);
2124 }
2125
2126 // NUL-terminate the string if there is any space left
2127 if (res < dstLen)
2128 dst[res] = 0;
2129 }
2130 else // no destination buffer
2131 {
2132 // convert using temp buffer to calculate the size of the buffer needed
2133 wchar_t tbuf[8];
2134 res = 0;
2135
2136 do
2137 {
2138 char* bufPtr = (char*)tbuf;
2139 dstLen = 8 * SIZEOF_WCHAR_T;
2140
2141 cres = iconv(m2w,
2142 ICONV_CHAR_CAST(&pszPtr), &srcLen,
2143 &bufPtr, &dstLen );
2144
2145 res += 8 - (dstLen / SIZEOF_WCHAR_T);
2146 }
2147 while ((cres == (size_t)-1) && (errno == E2BIG));
2148 }
2149
2150 if (ICONV_FAILED(cres, srcLen))
2151 {
2152 //VS: it is ok if iconv fails, hence trace only
2153 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2154 return wxCONV_FAILED;
2155 }
2156
2157 return res;
2158 }
2159
2160 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2161 const wchar_t *src, size_t srcLen) const
2162 {
2163 #if wxUSE_THREADS
2164 // NB: explained in MB2WC
2165 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2166 #endif
2167
2168 if ( srcLen == wxNO_LEN )
2169 srcLen = wxWcslen(src);
2170
2171 size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2172 size_t outbuflen = dstLen;
2173 size_t res, cres;
2174
2175 wchar_t *tmpbuf = 0;
2176
2177 if (ms_wcNeedsSwap)
2178 {
2179 // need to copy to temp buffer to switch endianness
2180 // (doing WC_BSWAP twice on the original buffer won't help, as it
2181 // could be in read-only memory, or be accessed in some other thread)
2182 tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2183 for ( size_t i = 0; i < srcLen; i++ )
2184 tmpbuf[i] = WC_BSWAP(src[i]);
2185
2186 tmpbuf[srcLen] = L'\0';
2187 src = tmpbuf;
2188 }
2189
2190 char* inbuf = (char*)src;
2191 if ( dst )
2192 {
2193 // have destination buffer, convert there
2194 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2195
2196 res = dstLen - outbuflen;
2197
2198 // NB: iconv was given only wcslen(src) characters on input, and so
2199 // it couldn't convert the trailing zero. Let's do it ourselves
2200 // if there's some room left for it in the output buffer.
2201 if (res < dstLen)
2202 dst[0] = 0;
2203 }
2204 else // no destination buffer
2205 {
2206 // convert using temp buffer to calculate the size of the buffer needed
2207 char tbuf[16];
2208 res = 0;
2209 do
2210 {
2211 dst = tbuf;
2212 outbuflen = 16;
2213
2214 cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2215
2216 res += 16 - outbuflen;
2217 }
2218 while ((cres == (size_t)-1) && (errno == E2BIG));
2219 }
2220
2221 if (ms_wcNeedsSwap)
2222 {
2223 free(tmpbuf);
2224 }
2225
2226 if (ICONV_FAILED(cres, inbuflen))
2227 {
2228 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2229 return wxCONV_FAILED;
2230 }
2231
2232 return res;
2233 }
2234
2235 size_t wxMBConv_iconv::GetMBNulLen() const
2236 {
2237 if ( m_minMBCharWidth == 0 )
2238 {
2239 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2240
2241 #if wxUSE_THREADS
2242 // NB: explained in MB2WC
2243 wxMutexLocker lock(self->m_iconvMutex);
2244 #endif
2245
2246 const wchar_t *wnul = L"";
2247 char buf[8]; // should be enough for NUL in any encoding
2248 size_t inLen = sizeof(wchar_t),
2249 outLen = WXSIZEOF(buf);
2250 char *inBuff = (char *)wnul;
2251 char *outBuff = buf;
2252 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2253 {
2254 self->m_minMBCharWidth = (size_t)-1;
2255 }
2256 else // ok
2257 {
2258 self->m_minMBCharWidth = outBuff - buf;
2259 }
2260 }
2261
2262 return m_minMBCharWidth;
2263 }
2264
2265 #if wxUSE_UNICODE_UTF8
2266 bool wxMBConv_iconv::IsUTF8() const
2267 {
2268 return wxStricmp(m_name, "UTF-8") == 0 ||
2269 wxStricmp(m_name, "UTF8") == 0;
2270 }
2271 #endif
2272
2273 #endif // HAVE_ICONV
2274
2275
2276 // ============================================================================
2277 // Win32 conversion classes
2278 // ============================================================================
2279
2280 #ifdef wxHAVE_WIN32_MB2WC
2281
2282 // from utils.cpp
2283 #if wxUSE_FONTMAP
2284 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2285 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2286 #endif
2287
2288 class wxMBConv_win32 : public wxMBConv
2289 {
2290 public:
2291 wxMBConv_win32()
2292 {
2293 m_CodePage = CP_ACP;
2294 m_minMBCharWidth = 0;
2295 }
2296
2297 wxMBConv_win32(const wxMBConv_win32& conv)
2298 : wxMBConv()
2299 {
2300 m_CodePage = conv.m_CodePage;
2301 m_minMBCharWidth = conv.m_minMBCharWidth;
2302 }
2303
2304 #if wxUSE_FONTMAP
2305 wxMBConv_win32(const char* name)
2306 {
2307 m_CodePage = wxCharsetToCodepage(name);
2308 m_minMBCharWidth = 0;
2309 }
2310
2311 wxMBConv_win32(wxFontEncoding encoding)
2312 {
2313 m_CodePage = wxEncodingToCodepage(encoding);
2314 m_minMBCharWidth = 0;
2315 }
2316 #endif // wxUSE_FONTMAP
2317
2318 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2319 {
2320 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2321 // the behaviour is not compatible with the Unix version (using iconv)
2322 // and break the library itself, e.g. wxTextInputStream::NextChar()
2323 // wouldn't work if reading an incomplete MB char didn't result in an
2324 // error
2325 //
2326 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2327 // Win XP or newer and it is not supported for UTF-[78] so we always
2328 // use our own conversions in this case. See
2329 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2330 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2331 if ( m_CodePage == CP_UTF8 )
2332 {
2333 return wxMBConvUTF8().MB2WC(buf, psz, n);
2334 }
2335
2336 if ( m_CodePage == CP_UTF7 )
2337 {
2338 return wxMBConvUTF7().MB2WC(buf, psz, n);
2339 }
2340
2341 int flags = 0;
2342 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2343 IsAtLeastWin2kSP4() )
2344 {
2345 flags = MB_ERR_INVALID_CHARS;
2346 }
2347
2348 const size_t len = ::MultiByteToWideChar
2349 (
2350 m_CodePage, // code page
2351 flags, // flags: fall on error
2352 psz, // input string
2353 -1, // its length (NUL-terminated)
2354 buf, // output string
2355 buf ? n : 0 // size of output buffer
2356 );
2357 if ( !len )
2358 {
2359 // function totally failed
2360 return wxCONV_FAILED;
2361 }
2362
2363 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2364 // check if we succeeded, by doing a double trip:
2365 if ( !flags && buf )
2366 {
2367 const size_t mbLen = strlen(psz);
2368 wxCharBuffer mbBuf(mbLen);
2369 if ( ::WideCharToMultiByte
2370 (
2371 m_CodePage,
2372 0,
2373 buf,
2374 -1,
2375 mbBuf.data(),
2376 mbLen + 1, // size in bytes, not length
2377 NULL,
2378 NULL
2379 ) == 0 ||
2380 strcmp(mbBuf, psz) != 0 )
2381 {
2382 // we didn't obtain the same thing we started from, hence
2383 // the conversion was lossy and we consider that it failed
2384 return wxCONV_FAILED;
2385 }
2386 }
2387
2388 // note that it returns count of written chars for buf != NULL and size
2389 // of the needed buffer for buf == NULL so in either case the length of
2390 // the string (which never includes the terminating NUL) is one less
2391 return len - 1;
2392 }
2393
2394 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2395 {
2396 /*
2397 we have a problem here: by default, WideCharToMultiByte() may
2398 replace characters unrepresentable in the target code page with bad
2399 quality approximations such as turning "1/2" symbol (U+00BD) into
2400 "1" for the code pages which don't have it and we, obviously, want
2401 to avoid this at any price
2402
2403 the trouble is that this function does it _silently_, i.e. it won't
2404 even tell us whether it did or not... Win98/2000 and higher provide
2405 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2406 we have to resort to a round trip, i.e. check that converting back
2407 results in the same string -- this is, of course, expensive but
2408 otherwise we simply can't be sure to not garble the data.
2409 */
2410
2411 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2412 // it doesn't work with CJK encodings (which we test for rather roughly
2413 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2414 // supporting it
2415 BOOL usedDef wxDUMMY_INITIALIZE(false);
2416 BOOL *pUsedDef;
2417 int flags;
2418 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2419 {
2420 // it's our lucky day
2421 flags = WC_NO_BEST_FIT_CHARS;
2422 pUsedDef = &usedDef;
2423 }
2424 else // old system or unsupported encoding
2425 {
2426 flags = 0;
2427 pUsedDef = NULL;
2428 }
2429
2430 const size_t len = ::WideCharToMultiByte
2431 (
2432 m_CodePage, // code page
2433 flags, // either none or no best fit
2434 pwz, // input string
2435 -1, // it is (wide) NUL-terminated
2436 buf, // output buffer
2437 buf ? n : 0, // and its size
2438 NULL, // default "replacement" char
2439 pUsedDef // [out] was it used?
2440 );
2441
2442 if ( !len )
2443 {
2444 // function totally failed
2445 return wxCONV_FAILED;
2446 }
2447
2448 // we did something, check if we really succeeded
2449 if ( flags )
2450 {
2451 // check if the conversion failed, i.e. if any replacements
2452 // were done
2453 if ( usedDef )
2454 return wxCONV_FAILED;
2455 }
2456 else // we must resort to double tripping...
2457 {
2458 // first we need to ensure that we really have the MB data: this is
2459 // not the case if we're called with NULL buffer, in which case we
2460 // need to do the conversion yet again
2461 wxCharBuffer bufDef;
2462 if ( !buf )
2463 {
2464 bufDef = wxCharBuffer(len);
2465 buf = bufDef.data();
2466 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2467 buf, len, NULL, NULL) )
2468 return wxCONV_FAILED;
2469 }
2470
2471 if ( !n )
2472 n = wcslen(pwz);
2473 wxWCharBuffer wcBuf(n);
2474 if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2475 wcscmp(wcBuf, pwz) != 0 )
2476 {
2477 // we didn't obtain the same thing we started from, hence
2478 // the conversion was lossy and we consider that it failed
2479 return wxCONV_FAILED;
2480 }
2481 }
2482
2483 // see the comment above for the reason of "len - 1"
2484 return len - 1;
2485 }
2486
2487 virtual size_t GetMBNulLen() const
2488 {
2489 if ( m_minMBCharWidth == 0 )
2490 {
2491 int len = ::WideCharToMultiByte
2492 (
2493 m_CodePage, // code page
2494 0, // no flags
2495 L"", // input string
2496 1, // translate just the NUL
2497 NULL, // output buffer
2498 0, // and its size
2499 NULL, // no replacement char
2500 NULL // [out] don't care if it was used
2501 );
2502
2503 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2504 switch ( len )
2505 {
2506 default:
2507 wxLogDebug(_T("Unexpected NUL length %d"), len);
2508 self->m_minMBCharWidth = (size_t)-1;
2509 break;
2510
2511 case 0:
2512 self->m_minMBCharWidth = (size_t)-1;
2513 break;
2514
2515 case 1:
2516 case 2:
2517 case 4:
2518 self->m_minMBCharWidth = len;
2519 break;
2520 }
2521 }
2522
2523 return m_minMBCharWidth;
2524 }
2525
2526 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2527
2528 bool IsOk() const { return m_CodePage != -1; }
2529
2530 private:
2531 static bool CanUseNoBestFit()
2532 {
2533 static int s_isWin98Or2k = -1;
2534
2535 if ( s_isWin98Or2k == -1 )
2536 {
2537 int verMaj, verMin;
2538 switch ( wxGetOsVersion(&verMaj, &verMin) )
2539 {
2540 case wxOS_WINDOWS_9X:
2541 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2542 break;
2543
2544 case wxOS_WINDOWS_NT:
2545 s_isWin98Or2k = verMaj >= 5;
2546 break;
2547
2548 default:
2549 // unknown: be conservative by default
2550 s_isWin98Or2k = 0;
2551 break;
2552 }
2553
2554 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2555 }
2556
2557 return s_isWin98Or2k == 1;
2558 }
2559
2560 static bool IsAtLeastWin2kSP4()
2561 {
2562 #ifdef __WXWINCE__
2563 return false;
2564 #else
2565 static int s_isAtLeastWin2kSP4 = -1;
2566
2567 if ( s_isAtLeastWin2kSP4 == -1 )
2568 {
2569 OSVERSIONINFOEX ver;
2570
2571 memset(&ver, 0, sizeof(ver));
2572 ver.dwOSVersionInfoSize = sizeof(ver);
2573 GetVersionEx((OSVERSIONINFO*)&ver);
2574
2575 s_isAtLeastWin2kSP4 =
2576 ((ver.dwMajorVersion > 5) || // Vista+
2577 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2578 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2579 ver.wServicePackMajor >= 4)) // 2000 SP4+
2580 ? 1 : 0;
2581 }
2582
2583 return s_isAtLeastWin2kSP4 == 1;
2584 #endif
2585 }
2586
2587
2588 // the code page we're working with
2589 long m_CodePage;
2590
2591 // cached result of GetMBNulLen(), set to 0 initially meaning
2592 // "unknown"
2593 size_t m_minMBCharWidth;
2594 };
2595
2596 #endif // wxHAVE_WIN32_MB2WC
2597
2598
2599 // ============================================================================
2600 // wxEncodingConverter based conversion classes
2601 // ============================================================================
2602
2603 #if wxUSE_FONTMAP
2604
2605 class wxMBConv_wxwin : public wxMBConv
2606 {
2607 private:
2608 void Init()
2609 {
2610 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2611 // The wxMBConv_cf class does a better job.
2612 m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2613 m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2614 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2615 }
2616
2617 public:
2618 // temporarily just use wxEncodingConverter stuff,
2619 // so that it works while a better implementation is built
2620 wxMBConv_wxwin(const char* name)
2621 {
2622 if (name)
2623 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2624 else
2625 m_enc = wxFONTENCODING_SYSTEM;
2626
2627 Init();
2628 }
2629
2630 wxMBConv_wxwin(wxFontEncoding enc)
2631 {
2632 m_enc = enc;
2633
2634 Init();
2635 }
2636
2637 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2638 {
2639 size_t inbuf = strlen(psz);
2640 if (buf)
2641 {
2642 if (!m2w.Convert(psz, buf))
2643 return wxCONV_FAILED;
2644 }
2645 return inbuf;
2646 }
2647
2648 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2649 {
2650 const size_t inbuf = wxWcslen(psz);
2651 if (buf)
2652 {
2653 if (!w2m.Convert(psz, buf))
2654 return wxCONV_FAILED;
2655 }
2656
2657 return inbuf;
2658 }
2659
2660 virtual size_t GetMBNulLen() const
2661 {
2662 switch ( m_enc )
2663 {
2664 case wxFONTENCODING_UTF16BE:
2665 case wxFONTENCODING_UTF16LE:
2666 return 2;
2667
2668 case wxFONTENCODING_UTF32BE:
2669 case wxFONTENCODING_UTF32LE:
2670 return 4;
2671
2672 default:
2673 return 1;
2674 }
2675 }
2676
2677 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2678
2679 bool IsOk() const { return m_ok; }
2680
2681 public:
2682 wxFontEncoding m_enc;
2683 wxEncodingConverter m2w, w2m;
2684
2685 private:
2686 // were we initialized successfully?
2687 bool m_ok;
2688
2689 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2690 };
2691
2692 // make the constructors available for unit testing
2693 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2694 {
2695 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2696 if ( !result->IsOk() )
2697 {
2698 delete result;
2699 return 0;
2700 }
2701
2702 return result;
2703 }
2704
2705 #endif // wxUSE_FONTMAP
2706
2707 // ============================================================================
2708 // wxCSConv implementation
2709 // ============================================================================
2710
2711 void wxCSConv::Init()
2712 {
2713 m_name = NULL;
2714 m_convReal = NULL;
2715 m_deferred = true;
2716 }
2717
2718 wxCSConv::wxCSConv(const wxString& charset)
2719 {
2720 Init();
2721
2722 if ( !charset.empty() )
2723 {
2724 SetName(charset.ToAscii());
2725 }
2726
2727 #if wxUSE_FONTMAP
2728 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2729 #else
2730 m_encoding = wxFONTENCODING_SYSTEM;
2731 #endif
2732 }
2733
2734 wxCSConv::wxCSConv(wxFontEncoding encoding)
2735 {
2736 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2737 {
2738 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2739
2740 encoding = wxFONTENCODING_SYSTEM;
2741 }
2742
2743 Init();
2744
2745 m_encoding = encoding;
2746 }
2747
2748 wxCSConv::~wxCSConv()
2749 {
2750 Clear();
2751 }
2752
2753 wxCSConv::wxCSConv(const wxCSConv& conv)
2754 : wxMBConv()
2755 {
2756 Init();
2757
2758 SetName(conv.m_name);
2759 m_encoding = conv.m_encoding;
2760 }
2761
2762 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2763 {
2764 Clear();
2765
2766 SetName(conv.m_name);
2767 m_encoding = conv.m_encoding;
2768
2769 return *this;
2770 }
2771
2772 void wxCSConv::Clear()
2773 {
2774 free(m_name);
2775 delete m_convReal;
2776
2777 m_name = NULL;
2778 m_convReal = NULL;
2779 }
2780
2781 void wxCSConv::SetName(const char *charset)
2782 {
2783 if (charset)
2784 {
2785 m_name = wxStrdup(charset);
2786 m_deferred = true;
2787 }
2788 }
2789
2790 #if wxUSE_FONTMAP
2791
2792 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2793 wxEncodingNameCache );
2794
2795 static wxEncodingNameCache gs_nameCache;
2796 #endif
2797
2798 wxMBConv *wxCSConv::DoCreate() const
2799 {
2800 #if wxUSE_FONTMAP
2801 wxLogTrace(TRACE_STRCONV,
2802 wxT("creating conversion for %s"),
2803 (m_name ? m_name
2804 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2805 #endif // wxUSE_FONTMAP
2806
2807 // check for the special case of ASCII or ISO8859-1 charset: as we have
2808 // special knowledge of it anyhow, we don't need to create a special
2809 // conversion object
2810 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2811 m_encoding == wxFONTENCODING_DEFAULT )
2812 {
2813 // don't convert at all
2814 return NULL;
2815 }
2816
2817 // we trust OS to do conversion better than we can so try external
2818 // conversion methods first
2819 //
2820 // the full order is:
2821 // 1. OS conversion (iconv() under Unix or Win32 API)
2822 // 2. hard coded conversions for UTF
2823 // 3. wxEncodingConverter as fall back
2824
2825 // step (1)
2826 #ifdef HAVE_ICONV
2827 #if !wxUSE_FONTMAP
2828 if ( m_name )
2829 #endif // !wxUSE_FONTMAP
2830 {
2831 #if wxUSE_FONTMAP
2832 wxFontEncoding encoding(m_encoding);
2833 #endif
2834
2835 if ( m_name )
2836 {
2837 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2838 if ( conv->IsOk() )
2839 return conv;
2840
2841 delete conv;
2842
2843 #if wxUSE_FONTMAP
2844 encoding =
2845 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2846 #endif // wxUSE_FONTMAP
2847 }
2848 #if wxUSE_FONTMAP
2849 {
2850 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2851 if ( it != gs_nameCache.end() )
2852 {
2853 if ( it->second.empty() )
2854 return NULL;
2855
2856 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2857 if ( conv->IsOk() )
2858 return conv;
2859
2860 delete conv;
2861 }
2862
2863 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2864 // CS : in case this does not return valid names (eg for MacRoman)
2865 // encoding got a 'failure' entry in the cache all the same,
2866 // although it just has to be created using a different method, so
2867 // only store failed iconv creation attempts (or perhaps we
2868 // shoulnd't do this at all ?)
2869 if ( names[0] != NULL )
2870 {
2871 for ( ; *names; ++names )
2872 {
2873 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2874 // will need changes that will obsolete this
2875 wxString name(*names);
2876 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2877 if ( conv->IsOk() )
2878 {
2879 gs_nameCache[encoding] = *names;
2880 return conv;
2881 }
2882
2883 delete conv;
2884 }
2885
2886 gs_nameCache[encoding] = _T(""); // cache the failure
2887 }
2888 }
2889 #endif // wxUSE_FONTMAP
2890 }
2891 #endif // HAVE_ICONV
2892
2893 #ifdef wxHAVE_WIN32_MB2WC
2894 {
2895 #if wxUSE_FONTMAP
2896 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2897 : new wxMBConv_win32(m_encoding);
2898 if ( conv->IsOk() )
2899 return conv;
2900
2901 delete conv;
2902 #else
2903 return NULL;
2904 #endif
2905 }
2906 #endif // wxHAVE_WIN32_MB2WC
2907
2908 #ifdef __DARWIN__
2909 {
2910 // leave UTF16 and UTF32 to the built-ins of wx
2911 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2912 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2913 {
2914 #if wxUSE_FONTMAP
2915 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2916 : new wxMBConv_cf(m_encoding);
2917 #else
2918 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2919 #endif
2920
2921 if ( conv->IsOk() )
2922 return conv;
2923
2924 delete conv;
2925 }
2926 }
2927 #endif // __DARWIN__
2928
2929 // step (2)
2930 wxFontEncoding enc = m_encoding;
2931 #if wxUSE_FONTMAP
2932 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2933 {
2934 // use "false" to suppress interactive dialogs -- we can be called from
2935 // anywhere and popping up a dialog from here is the last thing we want to
2936 // do
2937 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2938 }
2939 #endif // wxUSE_FONTMAP
2940
2941 switch ( enc )
2942 {
2943 case wxFONTENCODING_UTF7:
2944 return new wxMBConvUTF7;
2945
2946 case wxFONTENCODING_UTF8:
2947 return new wxMBConvUTF8;
2948
2949 case wxFONTENCODING_UTF16BE:
2950 return new wxMBConvUTF16BE;
2951
2952 case wxFONTENCODING_UTF16LE:
2953 return new wxMBConvUTF16LE;
2954
2955 case wxFONTENCODING_UTF32BE:
2956 return new wxMBConvUTF32BE;
2957
2958 case wxFONTENCODING_UTF32LE:
2959 return new wxMBConvUTF32LE;
2960
2961 default:
2962 // nothing to do but put here to suppress gcc warnings
2963 break;
2964 }
2965
2966 // step (3)
2967 #if wxUSE_FONTMAP
2968 {
2969 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2970 : new wxMBConv_wxwin(m_encoding);
2971 if ( conv->IsOk() )
2972 return conv;
2973
2974 delete conv;
2975 }
2976 #endif // wxUSE_FONTMAP
2977
2978 // NB: This is a hack to prevent deadlock. What could otherwise happen
2979 // in Unicode build: wxConvLocal creation ends up being here
2980 // because of some failure and logs the error. But wxLog will try to
2981 // attach a timestamp, for which it will need wxConvLocal (to convert
2982 // time to char* and then wchar_t*), but that fails, tries to log the
2983 // error, but wxLog has an (already locked) critical section that
2984 // guards the static buffer.
2985 static bool alreadyLoggingError = false;
2986 if (!alreadyLoggingError)
2987 {
2988 alreadyLoggingError = true;
2989 wxLogError(_("Cannot convert from the charset '%s'!"),
2990 m_name ? m_name
2991 :
2992 #if wxUSE_FONTMAP
2993 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2994 #else // !wxUSE_FONTMAP
2995 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2996 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2997 );
2998
2999 alreadyLoggingError = false;
3000 }
3001
3002 return NULL;
3003 }
3004
3005 void wxCSConv::CreateConvIfNeeded() const
3006 {
3007 if ( m_deferred )
3008 {
3009 wxCSConv *self = (wxCSConv *)this; // const_cast
3010
3011 // if we don't have neither the name nor the encoding, use the default
3012 // encoding for this system
3013 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3014 {
3015 #if wxUSE_INTL
3016 self->m_encoding = wxLocale::GetSystemEncoding();
3017 #else
3018 // fallback to some reasonable default:
3019 self->m_encoding = wxFONTENCODING_ISO8859_1;
3020 #endif // wxUSE_INTL
3021 }
3022
3023 self->m_convReal = DoCreate();
3024 self->m_deferred = false;
3025 }
3026 }
3027
3028 bool wxCSConv::IsOk() const
3029 {
3030 CreateConvIfNeeded();
3031
3032 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3033 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3034 return true; // always ok as we do it ourselves
3035
3036 // m_convReal->IsOk() is called at its own creation, so we know it must
3037 // be ok if m_convReal is non-NULL
3038 return m_convReal != NULL;
3039 }
3040
3041 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3042 const char *src, size_t srcLen) const
3043 {
3044 CreateConvIfNeeded();
3045
3046 if (m_convReal)
3047 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3048
3049 // latin-1 (direct)
3050 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3051 }
3052
3053 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3054 const wchar_t *src, size_t srcLen) const
3055 {
3056 CreateConvIfNeeded();
3057
3058 if (m_convReal)
3059 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3060
3061 // latin-1 (direct)
3062 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3063 }
3064
3065 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3066 {
3067 CreateConvIfNeeded();
3068
3069 if (m_convReal)
3070 return m_convReal->MB2WC(buf, psz, n);
3071
3072 // latin-1 (direct)
3073 size_t len = strlen(psz);
3074
3075 if (buf)
3076 {
3077 for (size_t c = 0; c <= len; c++)
3078 buf[c] = (unsigned char)(psz[c]);
3079 }
3080
3081 return len;
3082 }
3083
3084 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3085 {
3086 CreateConvIfNeeded();
3087
3088 if (m_convReal)
3089 return m_convReal->WC2MB(buf, psz, n);
3090
3091 // latin-1 (direct)
3092 const size_t len = wxWcslen(psz);
3093 if (buf)
3094 {
3095 for (size_t c = 0; c <= len; c++)
3096 {
3097 if (psz[c] > 0xFF)
3098 return wxCONV_FAILED;
3099
3100 buf[c] = (char)psz[c];
3101 }
3102 }
3103 else
3104 {
3105 for (size_t c = 0; c <= len; c++)
3106 {
3107 if (psz[c] > 0xFF)
3108 return wxCONV_FAILED;
3109 }
3110 }
3111
3112 return len;
3113 }
3114
3115 size_t wxCSConv::GetMBNulLen() const
3116 {
3117 CreateConvIfNeeded();
3118
3119 if ( m_convReal )
3120 {
3121 return m_convReal->GetMBNulLen();
3122 }
3123
3124 // otherwise, we are ISO-8859-1
3125 return 1;
3126 }
3127
3128 #if wxUSE_UNICODE_UTF8
3129 bool wxCSConv::IsUTF8() const
3130 {
3131 CreateConvIfNeeded();
3132
3133 if ( m_convReal )
3134 {
3135 return m_convReal->IsUTF8();
3136 }
3137
3138 // otherwise, we are ISO-8859-1
3139 return false;
3140 }
3141 #endif
3142
3143
3144 #if wxUSE_UNICODE
3145
3146 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3147 {
3148 if ( !s )
3149 return wxWCharBuffer();
3150
3151 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3152 if ( !wbuf )
3153 wbuf = wxMBConvUTF8().cMB2WX(s);
3154 if ( !wbuf )
3155 wbuf = wxConvISO8859_1.cMB2WX(s);
3156
3157 return wbuf;
3158 }
3159
3160 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3161 {
3162 if ( !ws )
3163 return wxCharBuffer();
3164
3165 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3166 if ( !buf )
3167 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3168
3169 return buf;
3170 }
3171
3172 #endif // wxUSE_UNICODE
3173
3174 // ----------------------------------------------------------------------------
3175 // globals
3176 // ----------------------------------------------------------------------------
3177
// NB: The reason why we create the converter objects in this convoluted
//     way, using a factory function instead of a global variable, is that
//     they may be used at static initialization time (some of them are used
//     by wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3184
3185 #undef wxConvLibc
3186 #undef wxConvUTF8
3187 #undef wxConvUTF7
3188 #undef wxConvLocal
3189 #undef wxConvISO8859_1
3190
// Define the global pointer name##Ptr (initially NULL) and the accessor
// wxGet_##name##Ptr() returning the address of a function-local static
// object of type impl_klass constructed with ctor_args.
//
// The static gs_##name##instance variable defined at the end ensures that
// all global converter objects are created by the time static
// initialization is done, i.e. before any thread is launched.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the object has the same type as
// the pointer to it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args)                   \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3205
3206 #ifdef __WINDOWS__
3207 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3208 #else
3209 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3210 #endif
3211
3212 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3213 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3214 // provokes an error message about "not enough macro parameters"; and we
3215 // can't use "()" here as the name##Obj declaration would be parsed as a
3216 // function declaration then, so use a semicolon and live with an extra
3217 // empty statement (and hope that no compilers warns about this)
3218 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3219 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3220
3221 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3222 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3223
3224 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3225 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3226
3227 #ifdef __DARWIN__
3228 // The xnu kernel always communicates file paths in decomposed UTF-8.
3229 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3230 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3231 #endif
3232
3233 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3234 #ifdef __DARWIN__
3235 &wxConvMacUTF8DObj;
3236 #else // !__DARWIN__
3237 wxGet_wxConvLibcPtr();
3238 #endif // __DARWIN__/!__DARWIN__
3239
3240 #else // !wxUSE_WCHAR_T
3241
3242 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3243 // stand-ins in absence of wchar_t
3244 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3245 wxConvISO8859_1,
3246 wxConvLocal,
3247 wxConvUTF8;
3248
3249 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T