Include wx/hashmap.h according to precompiled headers of wx/wx.h (with other minor...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifndef WX_PRECOMP
19 #include "wx/intl.h"
20 #include "wx/log.h"
21 #include "wx/utils.h"
22 #include "wx/hashmap.h"
23 #endif
24
25 #include "wx/strconv.h"
26
27 #if wxUSE_WCHAR_T
28
29 #ifdef __WINDOWS__
30 #include "wx/msw/private.h"
31 #include "wx/msw/missing.h"
32 #endif
33
34 #ifndef __WXWINCE__
35 #include <errno.h>
36 #endif
37
38 #include <ctype.h>
39 #include <string.h>
40 #include <stdlib.h>
41
42 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
43 #define wxHAVE_WIN32_MB2WC
44 #endif
45
46 #ifdef __SALFORDC__
47 #include <clib.h>
48 #endif
49
50 #ifdef HAVE_ICONV
51 #include <iconv.h>
52 #include "wx/thread.h"
53 #endif
54
55 #include "wx/encconv.h"
56 #include "wx/fontmap.h"
57
58 #ifdef __WXMAC__
59 #ifndef __DARWIN__
60 #include <ATSUnicode.h>
61 #include <TextCommon.h>
62 #include <TextEncodingConverter.h>
63 #endif
64
65 // includes Mac headers
66 #include "wx/mac/private.h"
67 #endif
68
69
70 #define TRACE_STRCONV _T("strconv")
71
72 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
73 // be 4 bytes
74 #if SIZEOF_WCHAR_T == 2
75 #define WC_UTF16
76 #endif
77
78
79 // ============================================================================
80 // implementation
81 // ============================================================================
82
// Helper of the conversion functions: returns true if at least one of the n
// bytes starting at p is not NUL and false if all of them are NUL (which is
// trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
91
92 // ----------------------------------------------------------------------------
93 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
94 // ----------------------------------------------------------------------------
95
96 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
97 {
98 if (input <= 0xffff)
99 {
100 if (output)
101 *output = (wxUint16) input;
102
103 return 1;
104 }
105 else if (input >= 0x110000)
106 {
107 return wxCONV_FAILED;
108 }
109 else
110 {
111 if (output)
112 {
113 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
114 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
115 }
116
117 return 2;
118 }
119 }
120
121 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
122 {
123 if ((*input < 0xd800) || (*input > 0xdfff))
124 {
125 output = *input;
126 return 1;
127 }
128 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
129 {
130 output = *input;
131 return wxCONV_FAILED;
132 }
133 else
134 {
135 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
136 return 2;
137 }
138 }
139
140 #ifdef WC_UTF16
141 typedef wchar_t wxDecodeSurrogate_t;
142 #else // !WC_UTF16
143 typedef wxUint16 wxDecodeSurrogate_t;
144 #endif // WC_UTF16/!WC_UTF16
145
146 // returns the next UTF-32 character from the wchar_t buffer and advances the
147 // pointer to the character after this one
148 //
149 // if an invalid character is found, *pSrc is set to NULL, the caller must
150 // check for this
151 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
152 {
153 wxUint32 out;
154 const size_t
155 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
156 if ( n == wxCONV_FAILED )
157 *pSrc = NULL;
158 else
159 *pSrc += n;
160
161 return out;
162 }
163
164 // ----------------------------------------------------------------------------
165 // wxMBConv
166 // ----------------------------------------------------------------------------
167
// Default implementation of ToWChar() in terms of the old MB2WC().
//
// Converts srcLen bytes of src (or a single NUL-terminated string if srcLen is
// wxNO_LEN) chunk by chunk, the chunks being delimited by the NUL terminator
// of this encoding. Returns the number of wide characters [which would be]
// written to dst, counting one L'\0' per chunk, or wxCONV_FAILED if the
// conversion fails or if dst (of dstLen characters) is too small. dst may be
// NULL in which case only the required size is computed.
168 size_t
169 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
170 const char *src, size_t srcLen) const
171 {
172 // although new conversion classes are supposed to implement this function
173 // directly, the existing ones only implement the old MB2WC() and so, to
174 // avoid to have to rewrite all conversion classes at once, we provide a
175 // default (but not efficient) implementation of this one in terms of the
176 // old function by copying the input to ensure that it's NUL-terminated and
177 // then using MB2WC() to convert it
178
179 // the number of chars [which would be] written to dst [if it were not NULL]
180 size_t dstWritten = 0;
181
182 // the number of NULs terminating this string
183 size_t nulLen = 0; // not really needed, but just to avoid warnings
184
185 // if we were not given the input size we just have to assume that the
186 // string is properly terminated as we have no way of knowing how long it
187 // is anyhow, but if we do have the size check whether there are enough
188 // NULs at the end
189 wxCharBuffer bufTmp;
190 const char *srcEnd;
191 if ( srcLen != wxNO_LEN )
192 {
193 // we need to know how to find the end of this string
194 nulLen = GetMBNulLen();
195 if ( nulLen == wxCONV_FAILED )
196 return wxCONV_FAILED;
197
198 // if there are enough NULs we can avoid the copy
199 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
200 {
201 // make a copy in order to properly NUL-terminate the string
202 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
203 char * const p = bufTmp.data();
204 memcpy(p, src, srcLen);
205 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
206 *s = '\0';
207
208 src = bufTmp;
209 }
210
211 srcEnd = src + srcLen;
212 }
213 else // quit after the first loop iteration
214 {
215 srcEnd = NULL;
216 }
217
// each iteration converts one NUL-terminated chunk of the input
218 for ( ;; )
219 {
220 // try to convert the current chunk
221 size_t lenChunk = MB2WC(NULL, src, 0);
222 if ( lenChunk == wxCONV_FAILED )
223 return wxCONV_FAILED;
224
225 lenChunk++; // for the L'\0' at the end of this chunk
226
227 dstWritten += lenChunk;
228
// NB: the L'\0' of an empty chunk is counted in dstWritten just above but,
//     as we break out of the loop here, it is not stored in dst
229 if ( lenChunk == 1 )
230 {
231 // nothing left in the input string, conversion succeeded
232 break;
233 }
234
235 if ( dst )
236 {
237 if ( dstWritten > dstLen )
238 return wxCONV_FAILED;
239
240 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
241 return wxCONV_FAILED;
242
243 dst += lenChunk;
244 }
245
246 if ( !srcEnd )
247 {
248 // we convert just one chunk in this case as this is the entire
249 // string anyhow
250 break;
251 }
252
253 // advance the input pointer past the end of this chunk
254 while ( NotAllNULs(src, nulLen) )
255 {
256 // notice that we must skip over multiple bytes here as we suppose
257 // that if NUL takes 2 or 4 bytes, then all the other characters do
258 // too and so if advanced by a single byte we might erroneously
259 // detect sequences of NUL bytes in the middle of the input
260 src += nulLen;
261 }
262
263 src += nulLen; // skipping over its terminator as well
264
265 // note that ">=" (and not just "==") is needed here as the terminator
266 // we skipped just above could be inside or just after the buffer
267 // delimited by srcEnd
268 if ( src >= srcEnd )
269 break;
270 }
271
272 return dstWritten;
273 }
274
275 size_t
276 wxMBConv::FromWChar(char *dst, size_t dstLen,
277 const wchar_t *src, size_t srcLen) const
278 {
279 // the number of chars [which would be] written to dst [if it were not NULL]
280 size_t dstWritten = 0;
281
282 // make a copy of the input string unless it is already properly
283 // NUL-terminated
284 //
285 // if we don't know its length we have no choice but to assume that it is,
286 // indeed, properly terminated
287 wxWCharBuffer bufTmp;
288 if ( srcLen == wxNO_LEN )
289 {
290 srcLen = wxWcslen(src) + 1;
291 }
292 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
293 {
294 // make a copy in order to properly NUL-terminate the string
295 bufTmp = wxWCharBuffer(srcLen);
296 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
297 src = bufTmp;
298 }
299
300 const size_t lenNul = GetMBNulLen();
301 for ( const wchar_t * const srcEnd = src + srcLen;
302 src < srcEnd;
303 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
304 {
305 // try to convert the current chunk
306 size_t lenChunk = WC2MB(NULL, src, 0);
307
308 if ( lenChunk == wxCONV_FAILED )
309 return wxCONV_FAILED;
310
311 lenChunk += lenNul;
312 dstWritten += lenChunk;
313
314 if ( dst )
315 {
316 if ( dstWritten > dstLen )
317 return wxCONV_FAILED;
318
319 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
320 return wxCONV_FAILED;
321
322 dst += lenChunk;
323 }
324 }
325
326 return dstWritten;
327 }
328
329 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
330 {
331 size_t rc = ToWChar(outBuff, outLen, inBuff);
332 if ( rc != wxCONV_FAILED )
333 {
334 // ToWChar() returns the buffer length, i.e. including the trailing
335 // NUL, while this method doesn't take it into account
336 rc--;
337 }
338
339 return rc;
340 }
341
342 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
343 {
344 size_t rc = FromWChar(outBuff, outLen, inBuff);
345 if ( rc != wxCONV_FAILED )
346 {
347 rc -= GetMBNulLen();
348 }
349
350 return rc;
351 }
352
353 wxMBConv::~wxMBConv()
354 {
355 // nothing to do here (necessary for Darwin linking probably)
356 }
357
358 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
359 {
360 if ( psz )
361 {
362 // calculate the length of the buffer needed first
363 const size_t nLen = MB2WC(NULL, psz, 0);
364 if ( nLen != wxCONV_FAILED )
365 {
366 // now do the actual conversion
367 wxWCharBuffer buf(nLen /* +1 added implicitly */);
368
369 // +1 for the trailing NULL
370 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
371 return buf;
372 }
373 }
374
375 return wxWCharBuffer();
376 }
377
378 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
379 {
380 if ( pwz )
381 {
382 const size_t nLen = WC2MB(NULL, pwz, 0);
383 if ( nLen != wxCONV_FAILED )
384 {
385 // extra space for trailing NUL(s)
386 static const size_t extraLen = GetMaxMBNulLen();
387
388 wxCharBuffer buf(nLen + extraLen - 1);
389 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
390 return buf;
391 }
392 }
393
394 return wxCharBuffer();
395 }
396
397 const wxWCharBuffer
398 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
399 {
400 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
401 if ( dstLen != wxCONV_FAILED )
402 {
403 wxWCharBuffer wbuf(dstLen - 1);
404 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
405 {
406 if ( outLen )
407 {
408 *outLen = dstLen;
409 if ( wbuf[dstLen - 1] == L'\0' )
410 (*outLen)--;
411 }
412
413 return wbuf;
414 }
415 }
416
417 if ( outLen )
418 *outLen = 0;
419
420 return wxWCharBuffer();
421 }
422
423 const wxCharBuffer
424 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
425 {
426 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
427 if ( dstLen != wxCONV_FAILED )
428 {
429 // special case of empty input: can't allocate 0 size buffer below as
430 // wxCharBuffer insists on NUL-terminating it
431 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
432 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
433 {
434 if ( outLen )
435 {
436 *outLen = dstLen;
437
438 const size_t nulLen = GetMBNulLen();
439 if ( dstLen >= nulLen &&
440 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
441 {
442 // in this case the output is NUL-terminated and we're not
443 // supposed to count NUL
444 *outLen -= nulLen;
445 }
446 }
447
448 return buf;
449 }
450 }
451
452 if ( outLen )
453 *outLen = 0;
454
455 return wxCharBuffer();
456 }
457
458 // ----------------------------------------------------------------------------
459 // wxMBConvLibc
460 // ----------------------------------------------------------------------------
461
462 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
463 {
464 return wxMB2WC(buf, psz, n);
465 }
466
467 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
468 {
469 return wxWC2MB(buf, psz, n);
470 }
471
472 // ----------------------------------------------------------------------------
473 // wxConvBrokenFileNames
474 // ----------------------------------------------------------------------------
475
#ifdef __UNIX__

// Creates the underlying converter: a UTF-8 one mapping invalid sequences to
// octal escapes if charset is NULL, "UTF-8" or "UTF8" (compared case
// insensitively), and a wxCSConv for the given charset otherwise.
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
{
    const bool useUTF8 = !charset ||
                         wxStricmp(charset, _T("UTF-8")) == 0 ||
                         wxStricmp(charset, _T("UTF8")) == 0;

    if ( useUTF8 )
        m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
    else
        m_conv = new wxCSConv(charset);
}

#endif // __UNIX__
488
489 // ----------------------------------------------------------------------------
490 // UTF-7
491 // ----------------------------------------------------------------------------
492
493 // Implementation (C) 2004 Fredrik Roubert
494
//
// BASE64 decoding table: maps each byte to the 6 bit value it represents or to
// 0xff if it is not a BASE64 character
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' - 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' - 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' - 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' - 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0x8f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x90 - 0x9f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xa0 - 0xaf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xb0 - 0xbf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xc0 - 0xcf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xd0 - 0xdf
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xe0 - 0xef
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0xf0 - 0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
533
// Decodes the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL, the decoded wide characters are stored in it, n being
// the size of the buffer; if it is NULL only the length is computed. Returns
// the number of wide characters produced, not counting the terminating NUL
// (which is only written if there is room left for it), or wxCONV_FAILED if a
// '+' is not followed by enough valid BASE64 characters to yield a byte.
//
// NOTE(review): the inner BASE64 loop stores to buf without comparing len
// with n, only the outer loop does it -- confirm that callers always pass a
// buffer big enough for the entire result.
534 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
535 {
536 size_t len = 0;
537
538 while ( *psz && (!buf || (len < n)) )
539 {
540 unsigned char cc = *psz++;
541 if (cc != '+')
542 {
543 // plain ASCII char
544 if (buf)
545 *buf++ = cc;
546 len++;
547 }
548 else if (*psz == '-')
549 {
550 // encoded plus sign
551 if (buf)
552 *buf++ = cc;
553 len++;
554 psz++;
555 }
556 else // start of BASE64 encoded string
557 {
// d accumulates the decoded bits, l is the number of bits in it which were
// not extracted yet, lsb tells whether the next byte extracted is the low
// byte of the current 16 bit unit (the high byte comes first) and ok becomes
// true as soon as at least one byte was extracted
558 bool lsb, ok;
559 unsigned int d, l;
560 for ( ok = lsb = false, d = 0, l = 0;
561 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
562 psz++ )
563 {
564 d <<= 6;
565 d += cc;
566 for (l += 6; l >= 8; lsb = !lsb)
567 {
568 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
569 if (lsb)
570 {
// low byte: complete the unit started below and move on to the next one
571 if (buf)
572 *buf++ |= c;
573 len ++;
574 }
575 else
576 {
// high byte: start a new unit
577 if (buf)
578 *buf = (wchar_t)(c << 8);
579 }
580
581 ok = true;
582 }
583 }
584
585 if ( !ok )
586 {
587 // in valid UTF7 we should have valid characters after '+'
588 return wxCONV_FAILED;
589 }
590
// the '-' terminating the BASE64 sequence is optional, skip it if present
591 if (*psz == '-')
592 psz++;
593 }
594 }
595
596 if ( buf && (len < n) )
597 *buf = '\0';
598
599 return len;
600 }
601
//
// BASE64 encoding table: maps a 6 bit value to the character representing it
//
static const unsigned char utf7enb64[] =
{
    // 0 - 15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 16 - 31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 32 - 47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 48 - 63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
616
//
// UTF-7 encoding table: the class of each of the 7 bit characters
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 0x00 - 0x0f
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 0x10 - 0x1f
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // 0x20 - 0x2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // 0x30 - 0x3f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x40 - 0x4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 0x50 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x60 - 0x6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 0x70 - 0x7f
};
636
// Encodes the NUL-terminated wide string psz in UTF-7.
//
// If buf is not NULL, the result is stored in it, n being the size of the
// buffer; if it is NULL only the length is computed. Returns the number of
// bytes produced, not counting the terminating NUL (which is only written if
// there is room left for it). If WC_UTF16 is not defined, wxCONV_FAILED is
// returned for characters greater than 0xffff.
//
// Only the characters of class 0 in utf7encode[] are copied directly, all the
// others are output as '+', their BASE64 representation and '-' (with '+'
// itself being output as "+-").
//
// NOTE(review): utf7encode[cc] is indexed after checking only that cc < 0x80,
// so a negative cc would index before the table if wchar_t is signed --
// confirm whether this can happen on the supported platforms.
637 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
638 {
639 size_t len = 0;
640
641 while (*psz && ((!buf) || (len < n)))
642 {
643 wchar_t cc = *psz++;
644 if (cc < 0x80 && utf7encode[cc] < 1)
645 {
646 // plain ASCII char
647 if (buf)
648 *buf++ = (char)cc;
649
650 len++;
651 }
652 #ifndef WC_UTF16
653 else if (((wxUint32)cc) > 0xffff)
654 {
655 // no surrogate pair generation (yet?)
656 return wxCONV_FAILED;
657 }
658 #endif
659 else
660 {
661 if (buf)
662 *buf++ = '+';
663
664 len++;
665 if (cc != '+')
666 {
667 // BASE64 encode string
// d accumulates the bits to output and l is the number of bits in it which
// were not output yet; each character contributes its high byte (lsb == 0)
// and then its low byte (lsb == 1)
668 unsigned int lsb, d, l;
669 for (d = 0, l = 0; /*nothing*/; psz++)
670 {
671 for (lsb = 0; lsb < 2; lsb ++)
672 {
673 d <<= 8;
674 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
675
676 for (l += 8; l >= 6; )
677 {
678 l -= 6;
679 if (buf)
680 *buf++ = utf7enb64[(d >> l) % 64];
681 len++;
682 }
683 }
684
// the BASE64 sequence continues until the end of the string or the first
// character which can be output directly
685 cc = *psz;
686 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
687 break;
688 }
689
// output the remaining bits, if any, padded with zero bits on the right
690 if (l != 0)
691 {
692 if (buf)
693 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
694
695 len++;
696 }
697 }
698
699 if (buf)
700 *buf++ = '-';
701 len++;
702 }
703 }
704
705 if (buf && (len < n))
706 *buf = 0;
707
708 return len;
709 }
710
711 // ----------------------------------------------------------------------------
712 // UTF-8
713 // ----------------------------------------------------------------------------
714
715 static wxUint32 utf8_max[]=
716 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
717
718 // boundaries of the private use area we use to (temporarily) remap invalid
719 // characters invalid in a UTF-8 encoded string
720 const wxUint32 wxUnicodePUA = 0x100000;
721 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
722
// Decodes the NUL-terminated UTF-8 string psz.
//
// If buf is not NULL, the decoded wide characters are stored in it, n being
// the size of the buffer; if it is NULL only the length is computed. Returns
// the number of wide characters produced, not counting the terminating NUL
// (which is only written if there is room left for it).
//
// The handling of invalid input depends on m_options: with
// MAP_INVALID_UTF8_TO_PUA each offending byte is mapped to the character
// wxUnicodePUA + byte, with MAP_INVALID_UTF8_TO_OCTAL it is replaced with a
// backslash followed by 3 octal digits (and valid backslashes are doubled),
// otherwise wxCONV_FAILED is returned.
//
// NOTE(review): when a surrogate pair is generated in the WC_UTF16 case, both
// units are stored after only checking len < n in the loop condition, so the
// second one could end up past the end of buf -- confirm.
723 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
724 {
725 size_t len = 0;
726
727 while (*psz && ((!buf) || (len < n)))
728 {
729 const char *opsz = psz;
730 bool invalid = false;
731 unsigned char cc = *psz++, fc = cc;
// count the leading 1 bits of the first byte
732 unsigned cnt;
733 for (cnt = 0; fc & 0x80; cnt++)
734 fc <<= 1;
735
736 if (!cnt)
737 {
738 // plain ASCII char
739 if (buf)
740 *buf++ = cc;
741 len++;
742
743 // escape the escape character for octal escapes
744 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
745 && cc == '\\' && (!buf || len < n))
746 {
747 if (buf)
748 *buf++ = cc;
749 len++;
750 }
751 }
752 else
753 {
// from now on cnt is the number of the continuation bytes expected
754 cnt--;
755 if (!cnt)
756 {
757 // invalid UTF-8 sequence
758 invalid = true;
759 }
760 else
761 {
762 unsigned ocnt = cnt - 1;
763 wxUint32 res = cc & (0x3f >> cnt);
764 while (cnt--)
765 {
766 cc = *psz;
767 if ((cc & 0xC0) != 0x80)
768 {
769 // invalid UTF-8 sequence
770 invalid = true;
771 break;
772 }
773
774 psz++;
775 res = (res << 6) | (cc & 0x3f);
776 }
777
// a value which would fit in a sequence one byte shorter is rejected too
778 if (invalid || res <= utf8_max[ocnt])
779 {
780 // illegal UTF-8 encoding
781 invalid = true;
782 }
783 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
784 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
785 {
786 // if one of our PUA characters turns up externally
787 // it must also be treated as an illegal sequence
788 // (a bit like you have to escape an escape character)
789 invalid = true;
790 }
791 else
792 {
793 #ifdef WC_UTF16
794 // cast is ok because wchar_t == wxUint16 if WC_UTF16
795 size_t pa = encode_utf16(res, (wxUint16 *)buf);
796 if (pa == wxCONV_FAILED)
797 {
798 invalid = true;
799 }
800 else
801 {
802 if (buf)
803 buf += pa;
804 len += pa;
805 }
806 #else // !WC_UTF16
807 if (buf)
808 *buf++ = (wchar_t)res;
809 len++;
810 #endif // WC_UTF16/!WC_UTF16
811 }
812 }
813
// map all the bytes between the start of the sequence (opsz) and the current
// position according to the options
814 if (invalid)
815 {
816 if (m_options & MAP_INVALID_UTF8_TO_PUA)
817 {
818 while (opsz < psz && (!buf || len < n))
819 {
820 #ifdef WC_UTF16
821 // cast is ok because wchar_t == wxUint16 if WC_UTF16
822 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
823 wxASSERT(pa != wxCONV_FAILED);
824 if (buf)
825 buf += pa;
826 opsz++;
827 len += pa;
828 #else
829 if (buf)
830 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
831 opsz++;
832 len++;
833 #endif
834 }
835 }
836 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
837 {
838 while (opsz < psz && (!buf || len < n))
839 {
// NB: the escape is only stored if it fits in the buffer entirely but len is
//     increased in any case
840 if ( buf && len + 3 < n )
841 {
842 unsigned char on = *opsz;
843 *buf++ = L'\\';
844 *buf++ = (wchar_t)( L'0' + on / 0100 );
845 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
846 *buf++ = (wchar_t)( L'0' + on % 010 );
847 }
848
849 opsz++;
850 len += 4;
851 }
852 }
853 else // MAP_INVALID_UTF8_NOT
854 {
855 return wxCONV_FAILED;
856 }
857 }
858 }
859 }
860
861 if (buf && (len < n))
862 *buf = 0;
863
864 return len;
865 }
866
// Returns true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch < L'8';
}
871
// Encodes the NUL-terminated wide string psz in UTF-8.
//
// If buf is not NULL, the result is stored in it, n being the size of the
// buffer; if it is NULL only the length is computed. Returns the number of
// bytes produced, not counting the terminating NUL (which is only written if
// there is room left for it).
//
// Depending on m_options, the mappings done by MB2WC() are undone: with
// MAP_INVALID_UTF8_TO_PUA the characters in [wxUnicodePUA, wxUnicodePUAEnd)
// are output as the single byte they stand for and with
// MAP_INVALID_UTF8_TO_OCTAL a double backslash is output as a single one and
// a backslash followed by 3 octal digits as the byte with this value.
//
// NOTE(review): all the bytes of a multibyte sequence are stored after only
// checking len < n in the loop condition, so the last ones could end up past
// the end of buf -- confirm.
872 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
873 {
874 size_t len = 0;
875
876 while (*psz && ((!buf) || (len < n)))
877 {
878 wxUint32 cc;
879
880 #ifdef WC_UTF16
881 // cast is ok for WC_UTF16
// an invalid surrogate is not an error here: the unit is just encoded as is
882 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
883 psz += (pa == wxCONV_FAILED) ? 1 : pa;
884 #else
885 cc = (*psz++) & 0x7fffffff;
886 #endif
887
888 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
889 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
890 {
891 if (buf)
892 *buf++ = (char)(cc - wxUnicodePUA);
893 len++;
894 }
895 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
896 && cc == L'\\' && psz[0] == L'\\' )
897 {
898 if (buf)
899 *buf++ = (char)cc;
900 psz++;
901 len++;
902 }
903 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
904 cc == L'\\' &&
905 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
906 {
907 if (buf)
908 {
909 *buf++ = (char) ((psz[0] - L'0') * 0100 +
910 (psz[1] - L'0') * 010 +
911 (psz[2] - L'0'));
912 }
913
914 psz += 3;
915 len++;
916 }
917 else
918 {
// find the number of the continuation bytes needed for this character
919 unsigned cnt;
920 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
921 {
922 }
923
924 if (!cnt)
925 {
926 // plain ASCII char
927 if (buf)
928 *buf++ = (char) cc;
929 len++;
930 }
931 else
932 {
933 len += cnt + 1;
934 if (buf)
935 {
// the lead byte is built from the length marker (-128 >> cnt) and the top
// bits of the character, each of the following bytes carries 6 more bits
936 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
937 while (cnt--)
938 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
939 }
940 }
941 }
942 }
943
944 if (buf && (len < n))
945 *buf = 0;
946
947 return len;
948 }
949
950 // ============================================================================
951 // UTF-16
952 // ============================================================================
953
954 #ifdef WORDS_BIGENDIAN
955 #define wxMBConvUTF16straight wxMBConvUTF16BE
956 #define wxMBConvUTF16swap wxMBConvUTF16LE
957 #else
958 #define wxMBConvUTF16swap wxMBConvUTF16BE
959 #define wxMBConvUTF16straight wxMBConvUTF16LE
960 #endif
961
962 /* static */
963 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
964 {
965 if ( srcLen == wxNO_LEN )
966 {
967 // count the number of bytes in input, including the trailing NULs
968 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
969 for ( srcLen = 1; *inBuff++; srcLen++ )
970 ;
971
972 srcLen *= BYTES_PER_CHAR;
973 }
974 else // we already have the length
975 {
976 // we can only convert an entire number of UTF-16 characters
977 if ( srcLen % BYTES_PER_CHAR )
978 return wxCONV_FAILED;
979 }
980
981 return srcLen;
982 }
983
984 // case when in-memory representation is UTF-16 too
985 #ifdef WC_UTF16
986
987 // ----------------------------------------------------------------------------
988 // conversions without endianness change
989 // ----------------------------------------------------------------------------
990
991 size_t
992 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
993 const char *src, size_t srcLen) const
994 {
995 // set up the scene for using memcpy() (which is presumably more efficient
996 // than copying the bytes one by one)
997 srcLen = GetLength(src, srcLen);
998 if ( srcLen == wxNO_LEN )
999 return wxCONV_FAILED;
1000
1001 const size_t inLen = srcLen / BYTES_PER_CHAR;
1002 if ( dst )
1003 {
1004 if ( dstLen < inLen )
1005 return wxCONV_FAILED;
1006
1007 memcpy(dst, src, srcLen);
1008 }
1009
1010 return inLen;
1011 }
1012
1013 size_t
1014 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1015 const wchar_t *src, size_t srcLen) const
1016 {
1017 if ( srcLen == wxNO_LEN )
1018 srcLen = wxWcslen(src) + 1;
1019
1020 srcLen *= BYTES_PER_CHAR;
1021
1022 if ( dst )
1023 {
1024 if ( dstLen < srcLen )
1025 return wxCONV_FAILED;
1026
1027 memcpy(dst, src, srcLen);
1028 }
1029
1030 return srcLen;
1031 }
1032
1033 // ----------------------------------------------------------------------------
1034 // endian-reversing conversions
1035 // ----------------------------------------------------------------------------
1036
1037 size_t
1038 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1039 const char *src, size_t srcLen) const
1040 {
1041 srcLen = GetLength(src, srcLen);
1042 if ( srcLen == wxNO_LEN )
1043 return wxCONV_FAILED;
1044
1045 srcLen /= BYTES_PER_CHAR;
1046
1047 if ( dst )
1048 {
1049 if ( dstLen < srcLen )
1050 return wxCONV_FAILED;
1051
1052 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1053 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1054 {
1055 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1056 }
1057 }
1058
1059 return srcLen;
1060 }
1061
1062 size_t
1063 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1064 const wchar_t *src, size_t srcLen) const
1065 {
1066 if ( srcLen == wxNO_LEN )
1067 srcLen = wxWcslen(src) + 1;
1068
1069 srcLen *= BYTES_PER_CHAR;
1070
1071 if ( dst )
1072 {
1073 if ( dstLen < srcLen )
1074 return wxCONV_FAILED;
1075
1076 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1077 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1078 {
1079 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1080 }
1081 }
1082
1083 return srcLen;
1084 }
1085
1086 #else // !WC_UTF16: wchar_t is UTF-32
1087
1088 // ----------------------------------------------------------------------------
1089 // conversions without endianness change
1090 // ----------------------------------------------------------------------------
1091
1092 size_t
1093 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1094 const char *src, size_t srcLen) const
1095 {
1096 srcLen = GetLength(src, srcLen);
1097 if ( srcLen == wxNO_LEN )
1098 return wxCONV_FAILED;
1099
1100 const size_t inLen = srcLen / BYTES_PER_CHAR;
1101 if ( !dst )
1102 {
1103 // optimization: return maximal space which could be needed for this
1104 // string even if the real size could be smaller if the buffer contains
1105 // any surrogates
1106 return inLen;
1107 }
1108
1109 size_t outLen = 0;
1110 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1111 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1112 {
1113 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1114 if ( !inBuff )
1115 return wxCONV_FAILED;
1116
1117 if ( ++outLen > dstLen )
1118 return wxCONV_FAILED;
1119
1120 *dst++ = ch;
1121 }
1122
1123
1124 return outLen;
1125 }
1126
1127 size_t
1128 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1129 const wchar_t *src, size_t srcLen) const
1130 {
1131 if ( srcLen == wxNO_LEN )
1132 srcLen = wxWcslen(src) + 1;
1133
1134 size_t outLen = 0;
1135 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1136 for ( size_t n = 0; n < srcLen; n++ )
1137 {
1138 wxUint16 cc[2];
1139 const size_t numChars = encode_utf16(*src++, cc);
1140 if ( numChars == wxCONV_FAILED )
1141 return wxCONV_FAILED;
1142
1143 outLen += numChars * BYTES_PER_CHAR;
1144 if ( outBuff )
1145 {
1146 if ( outLen > dstLen )
1147 return wxCONV_FAILED;
1148
1149 *outBuff++ = cc[0];
1150 if ( numChars == 2 )
1151 {
1152 // second character of a surrogate
1153 *outBuff++ = cc[1];
1154 }
1155 }
1156 }
1157
1158 return outLen;
1159 }
1160
1161 // ----------------------------------------------------------------------------
1162 // endian-reversing conversions
1163 // ----------------------------------------------------------------------------
1164
1165 size_t
1166 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1167 const char *src, size_t srcLen) const
1168 {
1169 srcLen = GetLength(src, srcLen);
1170 if ( srcLen == wxNO_LEN )
1171 return wxCONV_FAILED;
1172
1173 const size_t inLen = srcLen / BYTES_PER_CHAR;
1174 if ( !dst )
1175 {
1176 // optimization: return maximal space which could be needed for this
1177 // string even if the real size could be smaller if the buffer contains
1178 // any surrogates
1179 return inLen;
1180 }
1181
1182 size_t outLen = 0;
1183 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1184 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1185 {
1186 wxUint32 ch;
1187 wxUint16 tmp[2];
1188
1189 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1190 inBuff++;
1191 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1192
1193 const size_t numChars = decode_utf16(tmp, ch);
1194 if ( numChars == wxCONV_FAILED )
1195 return wxCONV_FAILED;
1196
1197 if ( numChars == 2 )
1198 inBuff++;
1199
1200 if ( ++outLen > dstLen )
1201 return wxCONV_FAILED;
1202
1203 *dst++ = ch;
1204 }
1205
1206
1207 return outLen;
1208 }
1209
1210 size_t
1211 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1212 const wchar_t *src, size_t srcLen) const
1213 {
1214 if ( srcLen == wxNO_LEN )
1215 srcLen = wxWcslen(src) + 1;
1216
1217 size_t outLen = 0;
1218 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1219 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1220 {
1221 wxUint16 cc[2];
1222 const size_t numChars = encode_utf16(*src, cc);
1223 if ( numChars == wxCONV_FAILED )
1224 return wxCONV_FAILED;
1225
1226 outLen += numChars * BYTES_PER_CHAR;
1227 if ( outBuff )
1228 {
1229 if ( outLen > dstLen )
1230 return wxCONV_FAILED;
1231
1232 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1233 if ( numChars == 2 )
1234 {
1235 // second character of a surrogate
1236 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1237 }
1238 }
1239 }
1240
1241 return outLen;
1242 }
1243
1244 #endif // WC_UTF16/!WC_UTF16
1245
1246
1247 // ============================================================================
1248 // UTF-32
1249 // ============================================================================
1250
1251 #ifdef WORDS_BIGENDIAN
1252 #define wxMBConvUTF32straight wxMBConvUTF32BE
1253 #define wxMBConvUTF32swap wxMBConvUTF32LE
1254 #else
1255 #define wxMBConvUTF32swap wxMBConvUTF32BE
1256 #define wxMBConvUTF32straight wxMBConvUTF32LE
1257 #endif
1258
1259
// the UTF-32 converter objects for both byte orders
//
// NOTE(review): WXDLLIMPEXP_DATA_BASE() is defined elsewhere, presumably it
// adds the DLL import/export decoration to the type -- confirm
1260 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1261 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1262
1263 /* static */
1264 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1265 {
1266 if ( srcLen == wxNO_LEN )
1267 {
1268 // count the number of bytes in input, including the trailing NULs
1269 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1270 for ( srcLen = 1; *inBuff++; srcLen++ )
1271 ;
1272
1273 srcLen *= BYTES_PER_CHAR;
1274 }
1275 else // we already have the length
1276 {
1277 // we can only convert an entire number of UTF-32 characters
1278 if ( srcLen % BYTES_PER_CHAR )
1279 return wxCONV_FAILED;
1280 }
1281
1282 return srcLen;
1283 }
1284
1285 // case when in-memory representation is UTF-16
1286 #ifdef WC_UTF16
1287
1288 // ----------------------------------------------------------------------------
1289 // conversions without endianness change
1290 // ----------------------------------------------------------------------------
1291
1292 size_t
1293 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1294 const char *src, size_t srcLen) const
1295 {
1296 srcLen = GetLength(src, srcLen);
1297 if ( srcLen == wxNO_LEN )
1298 return wxCONV_FAILED;
1299
1300 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1301 const size_t inLen = srcLen / BYTES_PER_CHAR;
1302 size_t outLen = 0;
1303 for ( size_t n = 0; n < inLen; n++ )
1304 {
1305 wxUint16 cc[2];
1306 const size_t numChars = encode_utf16(*inBuff++, cc);
1307 if ( numChars == wxCONV_FAILED )
1308 return wxCONV_FAILED;
1309
1310 outLen += numChars;
1311 if ( dst )
1312 {
1313 if ( outLen > dstLen )
1314 return wxCONV_FAILED;
1315
1316 *dst++ = cc[0];
1317 if ( numChars == 2 )
1318 {
1319 // second character of a surrogate
1320 *dst++ = cc[1];
1321 }
1322 }
1323 }
1324
1325 return outLen;
1326 }
1327
1328 size_t
1329 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1330 const wchar_t *src, size_t srcLen) const
1331 {
1332 if ( srcLen == wxNO_LEN )
1333 srcLen = wxWcslen(src) + 1;
1334
1335 if ( !dst )
1336 {
1337 // optimization: return maximal space which could be needed for this
1338 // string instead of the exact amount which could be less if there are
1339 // any surrogates in the input
1340 //
1341 // we consider that surrogates are rare enough to make it worthwhile to
1342 // avoid running the loop below at the cost of slightly extra memory
1343 // consumption
1344 return srcLen * BYTES_PER_CHAR;
1345 }
1346
1347 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1348 size_t outLen = 0;
1349 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1350 {
1351 const wxUint32 ch = wxDecodeSurrogate(&src);
1352 if ( !src )
1353 return wxCONV_FAILED;
1354
1355 outLen += BYTES_PER_CHAR;
1356
1357 if ( outLen > dstLen )
1358 return wxCONV_FAILED;
1359
1360 *outBuff++ = ch;
1361 }
1362
1363 return outLen;
1364 }
1365
1366 // ----------------------------------------------------------------------------
1367 // endian-reversing conversions
1368 // ----------------------------------------------------------------------------
1369
1370 size_t
1371 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1372 const char *src, size_t srcLen) const
1373 {
1374 srcLen = GetLength(src, srcLen);
1375 if ( srcLen == wxNO_LEN )
1376 return wxCONV_FAILED;
1377
1378 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1379 const size_t inLen = srcLen / BYTES_PER_CHAR;
1380 size_t outLen = 0;
1381 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1382 {
1383 wxUint16 cc[2];
1384 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1385 if ( numChars == wxCONV_FAILED )
1386 return wxCONV_FAILED;
1387
1388 outLen += numChars;
1389 if ( dst )
1390 {
1391 if ( outLen > dstLen )
1392 return wxCONV_FAILED;
1393
1394 *dst++ = cc[0];
1395 if ( numChars == 2 )
1396 {
1397 // second character of a surrogate
1398 *dst++ = cc[1];
1399 }
1400 }
1401 }
1402
1403 return outLen;
1404 }
1405
1406 size_t
1407 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1408 const wchar_t *src, size_t srcLen) const
1409 {
1410 if ( srcLen == wxNO_LEN )
1411 srcLen = wxWcslen(src) + 1;
1412
1413 if ( !dst )
1414 {
1415 // optimization: return maximal space which could be needed for this
1416 // string instead of the exact amount which could be less if there are
1417 // any surrogates in the input
1418 //
1419 // we consider that surrogates are rare enough to make it worthwhile to
1420 // avoid running the loop below at the cost of slightly extra memory
1421 // consumption
1422 return srcLen*BYTES_PER_CHAR;
1423 }
1424
1425 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1426 size_t outLen = 0;
1427 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1428 {
1429 const wxUint32 ch = wxDecodeSurrogate(&src);
1430 if ( !src )
1431 return wxCONV_FAILED;
1432
1433 outLen += BYTES_PER_CHAR;
1434
1435 if ( outLen > dstLen )
1436 return wxCONV_FAILED;
1437
1438 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1439 }
1440
1441 return outLen;
1442 }
1443
1444 #else // !WC_UTF16: wchar_t is UTF-32
1445
1446 // ----------------------------------------------------------------------------
1447 // conversions without endianness change
1448 // ----------------------------------------------------------------------------
1449
1450 size_t
1451 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1452 const char *src, size_t srcLen) const
1453 {
1454 // use memcpy() as it should be much faster than hand-written loop
1455 srcLen = GetLength(src, srcLen);
1456 if ( srcLen == wxNO_LEN )
1457 return wxCONV_FAILED;
1458
1459 const size_t inLen = srcLen/BYTES_PER_CHAR;
1460 if ( dst )
1461 {
1462 if ( dstLen < inLen )
1463 return wxCONV_FAILED;
1464
1465 memcpy(dst, src, srcLen);
1466 }
1467
1468 return inLen;
1469 }
1470
1471 size_t
1472 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1473 const wchar_t *src, size_t srcLen) const
1474 {
1475 if ( srcLen == wxNO_LEN )
1476 srcLen = wxWcslen(src) + 1;
1477
1478 srcLen *= BYTES_PER_CHAR;
1479
1480 if ( dst )
1481 {
1482 if ( dstLen < srcLen )
1483 return wxCONV_FAILED;
1484
1485 memcpy(dst, src, srcLen);
1486 }
1487
1488 return srcLen;
1489 }
1490
1491 // ----------------------------------------------------------------------------
1492 // endian-reversing conversions
1493 // ----------------------------------------------------------------------------
1494
1495 size_t
1496 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1497 const char *src, size_t srcLen) const
1498 {
1499 srcLen = GetLength(src, srcLen);
1500 if ( srcLen == wxNO_LEN )
1501 return wxCONV_FAILED;
1502
1503 srcLen /= BYTES_PER_CHAR;
1504
1505 if ( dst )
1506 {
1507 if ( dstLen < srcLen )
1508 return wxCONV_FAILED;
1509
1510 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1511 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1512 {
1513 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1514 }
1515 }
1516
1517 return srcLen;
1518 }
1519
1520 size_t
1521 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1522 const wchar_t *src, size_t srcLen) const
1523 {
1524 if ( srcLen == wxNO_LEN )
1525 srcLen = wxWcslen(src) + 1;
1526
1527 srcLen *= BYTES_PER_CHAR;
1528
1529 if ( dst )
1530 {
1531 if ( dstLen < srcLen )
1532 return wxCONV_FAILED;
1533
1534 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1535 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1536 {
1537 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1538 }
1539 }
1540
1541 return srcLen;
1542 }
1543
1544 #endif // WC_UTF16/!WC_UTF16
1545
1546
1547 // ============================================================================
1548 // The classes doing conversion using the iconv_xxx() functions
1549 // ============================================================================
1550
1551 #ifdef HAVE_ICONV
1552
1553 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1554 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1555 // (unless there's yet another bug in glibc) the only case when iconv()
1556 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1557 // left in the input buffer -- when _real_ error occurs,
1558 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1559 // iconv() failure.
1560 // [This bug does not appear in glibc 2.2.]
1561 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1562 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1563 (errno != E2BIG || bufLeft != 0))
1564 #else
1565 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1566 #endif
1567
1568 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1569
1570 #define ICONV_T_INVALID ((iconv_t)-1)
1571
1572 #if SIZEOF_WCHAR_T == 4
1573 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1574 #define WC_ENC wxFONTENCODING_UTF32
1575 #elif SIZEOF_WCHAR_T == 2
1576 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1577 #define WC_ENC wxFONTENCODING_UTF16
1578 #else // sizeof(wchar_t) != 2 nor 4
1579 // does this ever happen?
1580 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1581 #endif
1582
1583 // ----------------------------------------------------------------------------
1584 // wxMBConv_iconv: encapsulates an iconv character set
1585 // ----------------------------------------------------------------------------
1586
1587 class wxMBConv_iconv : public wxMBConv
1588 {
1589 public:
1590 wxMBConv_iconv(const wxChar *name);
1591 virtual ~wxMBConv_iconv();
1592
1593 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1594 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1595
1596 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1597 virtual size_t GetMBNulLen() const;
1598
1599 virtual wxMBConv *Clone() const
1600 {
1601 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1602 p->m_minMBCharWidth = m_minMBCharWidth;
1603 return p;
1604 }
1605
1606 bool IsOk() const
1607 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1608
1609 protected:
1610 // the iconv handlers used to translate from multibyte
1611 // to wide char and in the other direction
1612 iconv_t m2w,
1613 w2m;
1614
1615 #if wxUSE_THREADS
1616 // guards access to m2w and w2m objects
1617 wxMutex m_iconvMutex;
1618 #endif
1619
1620 private:
1621 // the name (for iconv_open()) of a wide char charset -- if none is
1622 // available on this machine, it will remain NULL
1623 static wxString ms_wcCharsetName;
1624
1625 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1626 // different endian-ness than the native one
1627 static bool ms_wcNeedsSwap;
1628
1629
1630 // name of the encoding handled by this conversion
1631 wxString m_name;
1632
1633 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1634 // initially
1635 size_t m_minMBCharWidth;
1636 };
1637
1638 // make the constructor available for unit testing
1639 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1640 {
1641 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1642 if ( !result->IsOk() )
1643 {
1644 delete result;
1645 return 0;
1646 }
1647
1648 return result;
1649 }
1650
1651 wxString wxMBConv_iconv::ms_wcCharsetName;
1652 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1653
1654 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1655 : m_name(name)
1656 {
1657 m_minMBCharWidth = 0;
1658
1659 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1660 // names for the charsets
1661 const wxCharBuffer cname(wxString(name).ToAscii());
1662
1663 // check for charset that represents wchar_t:
1664 if ( ms_wcCharsetName.empty() )
1665 {
1666 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1667
1668 #if wxUSE_FONTMAP
1669 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1670 #else // !wxUSE_FONTMAP
1671 static const wxChar *names[] =
1672 {
1673 #if SIZEOF_WCHAR_T == 4
1674 _T("UCS-4"),
1675 #elif SIZEOF_WCHAR_T = 2
1676 _T("UCS-2"),
1677 #endif
1678 NULL
1679 };
1680 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1681
1682 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1683 {
1684 const wxString nameCS(*names);
1685
1686 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1687 wxString nameXE(nameCS);
1688
1689 #ifdef WORDS_BIGENDIAN
1690 nameXE += _T("BE");
1691 #else // little endian
1692 nameXE += _T("LE");
1693 #endif
1694
1695 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1696 nameXE.c_str());
1697
1698 m2w = iconv_open(nameXE.ToAscii(), cname);
1699 if ( m2w == ICONV_T_INVALID )
1700 {
1701 // try charset w/o bytesex info (e.g. "UCS4")
1702 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1703 nameCS.c_str());
1704 m2w = iconv_open(nameCS.ToAscii(), cname);
1705
1706 // and check for bytesex ourselves:
1707 if ( m2w != ICONV_T_INVALID )
1708 {
1709 char buf[2], *bufPtr;
1710 wchar_t wbuf[2], *wbufPtr;
1711 size_t insz, outsz;
1712 size_t res;
1713
1714 buf[0] = 'A';
1715 buf[1] = 0;
1716 wbuf[0] = 0;
1717 insz = 2;
1718 outsz = SIZEOF_WCHAR_T * 2;
1719 wbufPtr = wbuf;
1720 bufPtr = buf;
1721
1722 res = iconv(
1723 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1724 (char**)&wbufPtr, &outsz);
1725
1726 if (ICONV_FAILED(res, insz))
1727 {
1728 wxLogLastError(wxT("iconv"));
1729 wxLogError(_("Conversion to charset '%s' doesn't work."),
1730 nameCS.c_str());
1731 }
1732 else // ok, can convert to this encoding, remember it
1733 {
1734 ms_wcCharsetName = nameCS;
1735 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1736 }
1737 }
1738 }
1739 else // use charset not requiring byte swapping
1740 {
1741 ms_wcCharsetName = nameXE;
1742 }
1743 }
1744
1745 wxLogTrace(TRACE_STRCONV,
1746 wxT("iconv wchar_t charset is \"%s\"%s"),
1747 ms_wcCharsetName.empty() ? _T("<none>")
1748 : ms_wcCharsetName.c_str(),
1749 ms_wcNeedsSwap ? _T(" (needs swap)")
1750 : _T(""));
1751 }
1752 else // we already have ms_wcCharsetName
1753 {
1754 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1755 }
1756
1757 if ( ms_wcCharsetName.empty() )
1758 {
1759 w2m = ICONV_T_INVALID;
1760 }
1761 else
1762 {
1763 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1764 if ( w2m == ICONV_T_INVALID )
1765 {
1766 wxLogTrace(TRACE_STRCONV,
1767 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1768 ms_wcCharsetName.c_str(), cname.data());
1769 }
1770 }
1771 }
1772
1773 wxMBConv_iconv::~wxMBConv_iconv()
1774 {
1775 if ( m2w != ICONV_T_INVALID )
1776 iconv_close(m2w);
1777 if ( w2m != ICONV_T_INVALID )
1778 iconv_close(w2m);
1779 }
1780
1781 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1782 {
1783 // find the string length: notice that must be done differently for
1784 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1785 size_t inbuf;
1786 const size_t nulLen = GetMBNulLen();
1787 switch ( nulLen )
1788 {
1789 default:
1790 return wxCONV_FAILED;
1791
1792 case 1:
1793 inbuf = strlen(psz); // arguably more optimized than our version
1794 break;
1795
1796 case 2:
1797 case 4:
1798 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1799 // they also have to start at character boundary and not span two
1800 // adjacent characters
1801 const char *p;
1802 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1803 ;
1804 inbuf = p - psz;
1805 break;
1806 }
1807
1808 #if wxUSE_THREADS
1809 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1810 // Unfortunately there is a couple of global wxCSConv objects such as
1811 // wxConvLocal that are used all over wx code, so we have to make sure
1812 // the handle is used by at most one thread at the time. Otherwise
1813 // only a few wx classes would be safe to use from non-main threads
1814 // as MB<->WC conversion would fail "randomly".
1815 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1816 #endif // wxUSE_THREADS
1817
1818 size_t outbuf = n * SIZEOF_WCHAR_T;
1819 size_t res, cres;
1820 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1821 wchar_t *bufPtr = buf;
1822 const char *pszPtr = psz;
1823
1824 if (buf)
1825 {
1826 // have destination buffer, convert there
1827 cres = iconv(m2w,
1828 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1829 (char**)&bufPtr, &outbuf);
1830 res = n - (outbuf / SIZEOF_WCHAR_T);
1831
1832 if (ms_wcNeedsSwap)
1833 {
1834 // convert to native endianness
1835 for ( unsigned i = 0; i < res; i++ )
1836 buf[n] = WC_BSWAP(buf[i]);
1837 }
1838
1839 // NUL-terminate the string if there is any space left
1840 if (res < n)
1841 buf[res] = 0;
1842 }
1843 else
1844 {
1845 // no destination buffer... convert using temp buffer
1846 // to calculate destination buffer requirement
1847 wchar_t tbuf[8];
1848 res = 0;
1849
1850 do
1851 {
1852 bufPtr = tbuf;
1853 outbuf = 8 * SIZEOF_WCHAR_T;
1854
1855 cres = iconv(m2w,
1856 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1857 (char**)&bufPtr, &outbuf );
1858
1859 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1860 }
1861 while ((cres == (size_t)-1) && (errno == E2BIG));
1862 }
1863
1864 if (ICONV_FAILED(cres, inbuf))
1865 {
1866 //VS: it is ok if iconv fails, hence trace only
1867 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1868 return wxCONV_FAILED;
1869 }
1870
1871 return res;
1872 }
1873
1874 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1875 {
1876 #if wxUSE_THREADS
1877 // NB: explained in MB2WC
1878 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1879 #endif
1880
1881 size_t inlen = wxWcslen(psz);
1882 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1883 size_t outbuf = n;
1884 size_t res, cres;
1885
1886 wchar_t *tmpbuf = 0;
1887
1888 if (ms_wcNeedsSwap)
1889 {
1890 // need to copy to temp buffer to switch endianness
1891 // (doing WC_BSWAP twice on the original buffer won't help, as it
1892 // could be in read-only memory, or be accessed in some other thread)
1893 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1894 for ( size_t i = 0; i < inlen; i++ )
1895 tmpbuf[n] = WC_BSWAP(psz[i]);
1896
1897 tmpbuf[inlen] = L'\0';
1898 psz = tmpbuf;
1899 }
1900
1901 if (buf)
1902 {
1903 // have destination buffer, convert there
1904 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1905
1906 res = n - outbuf;
1907
1908 // NB: iconv was given only wcslen(psz) characters on input, and so
1909 // it couldn't convert the trailing zero. Let's do it ourselves
1910 // if there's some room left for it in the output buffer.
1911 if (res < n)
1912 buf[0] = 0;
1913 }
1914 else
1915 {
1916 // no destination buffer: convert using temp buffer
1917 // to calculate destination buffer requirement
1918 char tbuf[16];
1919 res = 0;
1920 do
1921 {
1922 buf = tbuf;
1923 outbuf = 16;
1924
1925 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1926
1927 res += 16 - outbuf;
1928 }
1929 while ((cres == (size_t)-1) && (errno == E2BIG));
1930 }
1931
1932 if (ms_wcNeedsSwap)
1933 {
1934 free(tmpbuf);
1935 }
1936
1937 if (ICONV_FAILED(cres, inbuf))
1938 {
1939 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1940 return wxCONV_FAILED;
1941 }
1942
1943 return res;
1944 }
1945
1946 size_t wxMBConv_iconv::GetMBNulLen() const
1947 {
1948 if ( m_minMBCharWidth == 0 )
1949 {
1950 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1951
1952 #if wxUSE_THREADS
1953 // NB: explained in MB2WC
1954 wxMutexLocker lock(self->m_iconvMutex);
1955 #endif
1956
1957 wchar_t *wnul = L"";
1958 char buf[8]; // should be enough for NUL in any encoding
1959 size_t inLen = sizeof(wchar_t),
1960 outLen = WXSIZEOF(buf);
1961 char *inBuff = (char *)wnul;
1962 char *outBuff = buf;
1963 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1964 {
1965 self->m_minMBCharWidth = (size_t)-1;
1966 }
1967 else // ok
1968 {
1969 self->m_minMBCharWidth = outBuff - buf;
1970 }
1971 }
1972
1973 return m_minMBCharWidth;
1974 }
1975
1976 #endif // HAVE_ICONV
1977
1978
1979 // ============================================================================
1980 // Win32 conversion classes
1981 // ============================================================================
1982
1983 #ifdef wxHAVE_WIN32_MB2WC
1984
1985 // from utils.cpp
1986 #if wxUSE_FONTMAP
1987 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1988 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1989 #endif
1990
1991 class wxMBConv_win32 : public wxMBConv
1992 {
1993 public:
1994 wxMBConv_win32()
1995 {
1996 m_CodePage = CP_ACP;
1997 m_minMBCharWidth = 0;
1998 }
1999
2000 wxMBConv_win32(const wxMBConv_win32& conv)
2001 : wxMBConv()
2002 {
2003 m_CodePage = conv.m_CodePage;
2004 m_minMBCharWidth = conv.m_minMBCharWidth;
2005 }
2006
2007 #if wxUSE_FONTMAP
2008 wxMBConv_win32(const wxChar* name)
2009 {
2010 m_CodePage = wxCharsetToCodepage(name);
2011 m_minMBCharWidth = 0;
2012 }
2013
2014 wxMBConv_win32(wxFontEncoding encoding)
2015 {
2016 m_CodePage = wxEncodingToCodepage(encoding);
2017 m_minMBCharWidth = 0;
2018 }
2019 #endif // wxUSE_FONTMAP
2020
2021 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2022 {
2023 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2024 // the behaviour is not compatible with the Unix version (using iconv)
2025 // and break the library itself, e.g. wxTextInputStream::NextChar()
2026 // wouldn't work if reading an incomplete MB char didn't result in an
2027 // error
2028 //
2029 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2030 // Win XP or newer and it is not supported for UTF-[78] so we always
2031 // use our own conversions in this case. See
2032 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2033 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2034 if ( m_CodePage == CP_UTF8 )
2035 {
2036 return wxConvUTF8.MB2WC(buf, psz, n);
2037 }
2038
2039 if ( m_CodePage == CP_UTF7 )
2040 {
2041 return wxConvUTF7.MB2WC(buf, psz, n);
2042 }
2043
2044 int flags = 0;
2045 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2046 IsAtLeastWin2kSP4() )
2047 {
2048 flags = MB_ERR_INVALID_CHARS;
2049 }
2050
2051 const size_t len = ::MultiByteToWideChar
2052 (
2053 m_CodePage, // code page
2054 flags, // flags: fall on error
2055 psz, // input string
2056 -1, // its length (NUL-terminated)
2057 buf, // output string
2058 buf ? n : 0 // size of output buffer
2059 );
2060 if ( !len )
2061 {
2062 // function totally failed
2063 return wxCONV_FAILED;
2064 }
2065
2066 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2067 // check if we succeeded, by doing a double trip:
2068 if ( !flags && buf )
2069 {
2070 const size_t mbLen = strlen(psz);
2071 wxCharBuffer mbBuf(mbLen);
2072 if ( ::WideCharToMultiByte
2073 (
2074 m_CodePage,
2075 0,
2076 buf,
2077 -1,
2078 mbBuf.data(),
2079 mbLen + 1, // size in bytes, not length
2080 NULL,
2081 NULL
2082 ) == 0 ||
2083 strcmp(mbBuf, psz) != 0 )
2084 {
2085 // we didn't obtain the same thing we started from, hence
2086 // the conversion was lossy and we consider that it failed
2087 return wxCONV_FAILED;
2088 }
2089 }
2090
2091 // note that it returns count of written chars for buf != NULL and size
2092 // of the needed buffer for buf == NULL so in either case the length of
2093 // the string (which never includes the terminating NUL) is one less
2094 return len - 1;
2095 }
2096
2097 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2098 {
2099 /*
2100 we have a problem here: by default, WideCharToMultiByte() may
2101 replace characters unrepresentable in the target code page with bad
2102 quality approximations such as turning "1/2" symbol (U+00BD) into
2103 "1" for the code pages which don't have it and we, obviously, want
2104 to avoid this at any price
2105
2106 the trouble is that this function does it _silently_, i.e. it won't
2107 even tell us whether it did or not... Win98/2000 and higher provide
2108 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2109 we have to resort to a round trip, i.e. check that converting back
2110 results in the same string -- this is, of course, expensive but
2111 otherwise we simply can't be sure to not garble the data.
2112 */
2113
2114 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2115 // it doesn't work with CJK encodings (which we test for rather roughly
2116 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2117 // supporting it
2118 BOOL usedDef wxDUMMY_INITIALIZE(false);
2119 BOOL *pUsedDef;
2120 int flags;
2121 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2122 {
2123 // it's our lucky day
2124 flags = WC_NO_BEST_FIT_CHARS;
2125 pUsedDef = &usedDef;
2126 }
2127 else // old system or unsupported encoding
2128 {
2129 flags = 0;
2130 pUsedDef = NULL;
2131 }
2132
2133 const size_t len = ::WideCharToMultiByte
2134 (
2135 m_CodePage, // code page
2136 flags, // either none or no best fit
2137 pwz, // input string
2138 -1, // it is (wide) NUL-terminated
2139 buf, // output buffer
2140 buf ? n : 0, // and its size
2141 NULL, // default "replacement" char
2142 pUsedDef // [out] was it used?
2143 );
2144
2145 if ( !len )
2146 {
2147 // function totally failed
2148 return wxCONV_FAILED;
2149 }
2150
2151 // if we were really converting, check if we succeeded
2152 if ( buf )
2153 {
2154 if ( flags )
2155 {
2156 // check if the conversion failed, i.e. if any replacements
2157 // were done
2158 if ( usedDef )
2159 return wxCONV_FAILED;
2160 }
2161 else // we must resort to double tripping...
2162 {
2163 wxWCharBuffer wcBuf(n);
2164 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2165 wcscmp(wcBuf, pwz) != 0 )
2166 {
2167 // we didn't obtain the same thing we started from, hence
2168 // the conversion was lossy and we consider that it failed
2169 return wxCONV_FAILED;
2170 }
2171 }
2172 }
2173
2174 // see the comment above for the reason of "len - 1"
2175 return len - 1;
2176 }
2177
2178 virtual size_t GetMBNulLen() const
2179 {
2180 if ( m_minMBCharWidth == 0 )
2181 {
2182 int len = ::WideCharToMultiByte
2183 (
2184 m_CodePage, // code page
2185 0, // no flags
2186 L"", // input string
2187 1, // translate just the NUL
2188 NULL, // output buffer
2189 0, // and its size
2190 NULL, // no replacement char
2191 NULL // [out] don't care if it was used
2192 );
2193
2194 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2195 switch ( len )
2196 {
2197 default:
2198 wxLogDebug(_T("Unexpected NUL length %d"), len);
2199 self->m_minMBCharWidth = (size_t)-1;
2200 break;
2201
2202 case 0:
2203 self->m_minMBCharWidth = (size_t)-1;
2204 break;
2205
2206 case 1:
2207 case 2:
2208 case 4:
2209 self->m_minMBCharWidth = len;
2210 break;
2211 }
2212 }
2213
2214 return m_minMBCharWidth;
2215 }
2216
2217 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2218
2219 bool IsOk() const { return m_CodePage != -1; }
2220
2221 private:
2222 static bool CanUseNoBestFit()
2223 {
2224 static int s_isWin98Or2k = -1;
2225
2226 if ( s_isWin98Or2k == -1 )
2227 {
2228 int verMaj, verMin;
2229 switch ( wxGetOsVersion(&verMaj, &verMin) )
2230 {
2231 case wxWIN95:
2232 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2233 break;
2234
2235 case wxWINDOWS_NT:
2236 s_isWin98Or2k = verMaj >= 5;
2237 break;
2238
2239 default:
2240 // unknown: be conservative by default
2241 s_isWin98Or2k = 0;
2242 break;
2243 }
2244
2245 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2246 }
2247
2248 return s_isWin98Or2k == 1;
2249 }
2250
2251 static bool IsAtLeastWin2kSP4()
2252 {
2253 #ifdef __WXWINCE__
2254 return false;
2255 #else
2256 static int s_isAtLeastWin2kSP4 = -1;
2257
2258 if ( s_isAtLeastWin2kSP4 == -1 )
2259 {
2260 OSVERSIONINFOEX ver;
2261
2262 memset(&ver, 0, sizeof(ver));
2263 ver.dwOSVersionInfoSize = sizeof(ver);
2264 GetVersionEx((OSVERSIONINFO*)&ver);
2265
2266 s_isAtLeastWin2kSP4 =
2267 ((ver.dwMajorVersion > 5) || // Vista+
2268 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2269 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2270 ver.wServicePackMajor >= 4)) // 2000 SP4+
2271 ? 1 : 0;
2272 }
2273
2274 return s_isAtLeastWin2kSP4 == 1;
2275 #endif
2276 }
2277
2278
2279 // the code page we're working with
2280 long m_CodePage;
2281
2282 // cached result of GetMBNulLen(), set to 0 initially meaning
2283 // "unknown"
2284 size_t m_minMBCharWidth;
2285 };
2286
2287 #endif // wxHAVE_WIN32_MB2WC
2288
2289 // ============================================================================
2290 // Cocoa conversion classes
2291 // ============================================================================
2292
2293 #if defined(__WXCOCOA__)
2294
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a
// bit - it's just not public (yet).
2298
2299 #include <CoreFoundation/CFString.h>
2300 #include <CoreFoundation/CFStringEncodingExt.h>
2301
2302 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2303 {
2304 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2305
2306 switch (encoding)
2307 {
2308 case wxFONTENCODING_DEFAULT :
2309 enc = CFStringGetSystemEncoding();
2310 break ;
2311
2312 case wxFONTENCODING_ISO8859_1 :
2313 enc = kCFStringEncodingISOLatin1 ;
2314 break ;
2315 case wxFONTENCODING_ISO8859_2 :
2316 enc = kCFStringEncodingISOLatin2;
2317 break ;
2318 case wxFONTENCODING_ISO8859_3 :
2319 enc = kCFStringEncodingISOLatin3 ;
2320 break ;
2321 case wxFONTENCODING_ISO8859_4 :
2322 enc = kCFStringEncodingISOLatin4;
2323 break ;
2324 case wxFONTENCODING_ISO8859_5 :
2325 enc = kCFStringEncodingISOLatinCyrillic;
2326 break ;
2327 case wxFONTENCODING_ISO8859_6 :
2328 enc = kCFStringEncodingISOLatinArabic;
2329 break ;
2330 case wxFONTENCODING_ISO8859_7 :
2331 enc = kCFStringEncodingISOLatinGreek;
2332 break ;
2333 case wxFONTENCODING_ISO8859_8 :
2334 enc = kCFStringEncodingISOLatinHebrew;
2335 break ;
2336 case wxFONTENCODING_ISO8859_9 :
2337 enc = kCFStringEncodingISOLatin5;
2338 break ;
2339 case wxFONTENCODING_ISO8859_10 :
2340 enc = kCFStringEncodingISOLatin6;
2341 break ;
2342 case wxFONTENCODING_ISO8859_11 :
2343 enc = kCFStringEncodingISOLatinThai;
2344 break ;
2345 case wxFONTENCODING_ISO8859_13 :
2346 enc = kCFStringEncodingISOLatin7;
2347 break ;
2348 case wxFONTENCODING_ISO8859_14 :
2349 enc = kCFStringEncodingISOLatin8;
2350 break ;
2351 case wxFONTENCODING_ISO8859_15 :
2352 enc = kCFStringEncodingISOLatin9;
2353 break ;
2354
2355 case wxFONTENCODING_KOI8 :
2356 enc = kCFStringEncodingKOI8_R;
2357 break ;
2358 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2359 enc = kCFStringEncodingDOSRussian;
2360 break ;
2361
2362 // case wxFONTENCODING_BULGARIAN :
2363 // enc = ;
2364 // break ;
2365
2366 case wxFONTENCODING_CP437 :
2367 enc = kCFStringEncodingDOSLatinUS ;
2368 break ;
2369 case wxFONTENCODING_CP850 :
2370 enc = kCFStringEncodingDOSLatin1;
2371 break ;
2372 case wxFONTENCODING_CP852 :
2373 enc = kCFStringEncodingDOSLatin2;
2374 break ;
2375 case wxFONTENCODING_CP855 :
2376 enc = kCFStringEncodingDOSCyrillic;
2377 break ;
2378 case wxFONTENCODING_CP866 :
2379 enc = kCFStringEncodingDOSRussian ;
2380 break ;
2381 case wxFONTENCODING_CP874 :
2382 enc = kCFStringEncodingDOSThai;
2383 break ;
2384 case wxFONTENCODING_CP932 :
2385 enc = kCFStringEncodingDOSJapanese;
2386 break ;
2387 case wxFONTENCODING_CP936 :
2388 enc = kCFStringEncodingDOSChineseSimplif ;
2389 break ;
2390 case wxFONTENCODING_CP949 :
2391 enc = kCFStringEncodingDOSKorean;
2392 break ;
2393 case wxFONTENCODING_CP950 :
2394 enc = kCFStringEncodingDOSChineseTrad;
2395 break ;
2396 case wxFONTENCODING_CP1250 :
2397 enc = kCFStringEncodingWindowsLatin2;
2398 break ;
2399 case wxFONTENCODING_CP1251 :
2400 enc = kCFStringEncodingWindowsCyrillic ;
2401 break ;
2402 case wxFONTENCODING_CP1252 :
2403 enc = kCFStringEncodingWindowsLatin1 ;
2404 break ;
2405 case wxFONTENCODING_CP1253 :
2406 enc = kCFStringEncodingWindowsGreek;
2407 break ;
2408 case wxFONTENCODING_CP1254 :
2409 enc = kCFStringEncodingWindowsLatin5;
2410 break ;
2411 case wxFONTENCODING_CP1255 :
2412 enc = kCFStringEncodingWindowsHebrew ;
2413 break ;
2414 case wxFONTENCODING_CP1256 :
2415 enc = kCFStringEncodingWindowsArabic ;
2416 break ;
2417 case wxFONTENCODING_CP1257 :
2418 enc = kCFStringEncodingWindowsBalticRim;
2419 break ;
2420 // This only really encodes to UTF7 (if that) evidently
2421 // case wxFONTENCODING_UTF7 :
2422 // enc = kCFStringEncodingNonLossyASCII ;
2423 // break ;
2424 case wxFONTENCODING_UTF8 :
2425 enc = kCFStringEncodingUTF8 ;
2426 break ;
2427 case wxFONTENCODING_EUC_JP :
2428 enc = kCFStringEncodingEUC_JP;
2429 break ;
2430 case wxFONTENCODING_UTF16 :
2431 enc = kCFStringEncodingUnicode ;
2432 break ;
2433 case wxFONTENCODING_MACROMAN :
2434 enc = kCFStringEncodingMacRoman ;
2435 break ;
2436 case wxFONTENCODING_MACJAPANESE :
2437 enc = kCFStringEncodingMacJapanese ;
2438 break ;
2439 case wxFONTENCODING_MACCHINESETRAD :
2440 enc = kCFStringEncodingMacChineseTrad ;
2441 break ;
2442 case wxFONTENCODING_MACKOREAN :
2443 enc = kCFStringEncodingMacKorean ;
2444 break ;
2445 case wxFONTENCODING_MACARABIC :
2446 enc = kCFStringEncodingMacArabic ;
2447 break ;
2448 case wxFONTENCODING_MACHEBREW :
2449 enc = kCFStringEncodingMacHebrew ;
2450 break ;
2451 case wxFONTENCODING_MACGREEK :
2452 enc = kCFStringEncodingMacGreek ;
2453 break ;
2454 case wxFONTENCODING_MACCYRILLIC :
2455 enc = kCFStringEncodingMacCyrillic ;
2456 break ;
2457 case wxFONTENCODING_MACDEVANAGARI :
2458 enc = kCFStringEncodingMacDevanagari ;
2459 break ;
2460 case wxFONTENCODING_MACGURMUKHI :
2461 enc = kCFStringEncodingMacGurmukhi ;
2462 break ;
2463 case wxFONTENCODING_MACGUJARATI :
2464 enc = kCFStringEncodingMacGujarati ;
2465 break ;
2466 case wxFONTENCODING_MACORIYA :
2467 enc = kCFStringEncodingMacOriya ;
2468 break ;
2469 case wxFONTENCODING_MACBENGALI :
2470 enc = kCFStringEncodingMacBengali ;
2471 break ;
2472 case wxFONTENCODING_MACTAMIL :
2473 enc = kCFStringEncodingMacTamil ;
2474 break ;
2475 case wxFONTENCODING_MACTELUGU :
2476 enc = kCFStringEncodingMacTelugu ;
2477 break ;
2478 case wxFONTENCODING_MACKANNADA :
2479 enc = kCFStringEncodingMacKannada ;
2480 break ;
2481 case wxFONTENCODING_MACMALAJALAM :
2482 enc = kCFStringEncodingMacMalayalam ;
2483 break ;
2484 case wxFONTENCODING_MACSINHALESE :
2485 enc = kCFStringEncodingMacSinhalese ;
2486 break ;
2487 case wxFONTENCODING_MACBURMESE :
2488 enc = kCFStringEncodingMacBurmese ;
2489 break ;
2490 case wxFONTENCODING_MACKHMER :
2491 enc = kCFStringEncodingMacKhmer ;
2492 break ;
2493 case wxFONTENCODING_MACTHAI :
2494 enc = kCFStringEncodingMacThai ;
2495 break ;
2496 case wxFONTENCODING_MACLAOTIAN :
2497 enc = kCFStringEncodingMacLaotian ;
2498 break ;
2499 case wxFONTENCODING_MACGEORGIAN :
2500 enc = kCFStringEncodingMacGeorgian ;
2501 break ;
2502 case wxFONTENCODING_MACARMENIAN :
2503 enc = kCFStringEncodingMacArmenian ;
2504 break ;
2505 case wxFONTENCODING_MACCHINESESIMP :
2506 enc = kCFStringEncodingMacChineseSimp ;
2507 break ;
2508 case wxFONTENCODING_MACTIBETAN :
2509 enc = kCFStringEncodingMacTibetan ;
2510 break ;
2511 case wxFONTENCODING_MACMONGOLIAN :
2512 enc = kCFStringEncodingMacMongolian ;
2513 break ;
2514 case wxFONTENCODING_MACETHIOPIC :
2515 enc = kCFStringEncodingMacEthiopic ;
2516 break ;
2517 case wxFONTENCODING_MACCENTRALEUR :
2518 enc = kCFStringEncodingMacCentralEurRoman ;
2519 break ;
2520 case wxFONTENCODING_MACVIATNAMESE :
2521 enc = kCFStringEncodingMacVietnamese ;
2522 break ;
2523 case wxFONTENCODING_MACARABICEXT :
2524 enc = kCFStringEncodingMacExtArabic ;
2525 break ;
2526 case wxFONTENCODING_MACSYMBOL :
2527 enc = kCFStringEncodingMacSymbol ;
2528 break ;
2529 case wxFONTENCODING_MACDINGBATS :
2530 enc = kCFStringEncodingMacDingbats ;
2531 break ;
2532 case wxFONTENCODING_MACTURKISH :
2533 enc = kCFStringEncodingMacTurkish ;
2534 break ;
2535 case wxFONTENCODING_MACCROATIAN :
2536 enc = kCFStringEncodingMacCroatian ;
2537 break ;
2538 case wxFONTENCODING_MACICELANDIC :
2539 enc = kCFStringEncodingMacIcelandic ;
2540 break ;
2541 case wxFONTENCODING_MACROMANIAN :
2542 enc = kCFStringEncodingMacRomanian ;
2543 break ;
2544 case wxFONTENCODING_MACCELTIC :
2545 enc = kCFStringEncodingMacCeltic ;
2546 break ;
2547 case wxFONTENCODING_MACGAELIC :
2548 enc = kCFStringEncodingMacGaelic ;
2549 break ;
2550 // case wxFONTENCODING_MACKEYBOARD :
2551 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2552 // break ;
2553
2554 default :
2555 // because gcc is picky
2556 break ;
2557 }
2558
2559 return enc ;
2560 }
2561
2562 class wxMBConv_cocoa : public wxMBConv
2563 {
2564 public:
2565 wxMBConv_cocoa()
2566 {
2567 Init(CFStringGetSystemEncoding()) ;
2568 }
2569
2570 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2571 {
2572 m_encoding = conv.m_encoding;
2573 }
2574
2575 #if wxUSE_FONTMAP
2576 wxMBConv_cocoa(const wxChar* name)
2577 {
2578 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2579 }
2580 #endif
2581
2582 wxMBConv_cocoa(wxFontEncoding encoding)
2583 {
2584 Init( wxCFStringEncFromFontEnc(encoding) );
2585 }
2586
2587 ~wxMBConv_cocoa()
2588 {
2589 }
2590
2591 void Init( CFStringEncoding encoding)
2592 {
2593 m_encoding = encoding ;
2594 }
2595
2596 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2597 {
2598 wxASSERT(szUnConv);
2599
2600 CFStringRef theString = CFStringCreateWithBytes (
2601 NULL, //the allocator
2602 (const UInt8*)szUnConv,
2603 strlen(szUnConv),
2604 m_encoding,
2605 false //no BOM/external representation
2606 );
2607
2608 wxASSERT(theString);
2609
2610 size_t nOutLength = CFStringGetLength(theString);
2611
2612 if (szOut == NULL)
2613 {
2614 CFRelease(theString);
2615 return nOutLength;
2616 }
2617
2618 CFRange theRange = { 0, nOutSize };
2619
2620 #if SIZEOF_WCHAR_T == 4
2621 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2622 #endif
2623
2624 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2625
2626 CFRelease(theString);
2627
2628 szUniCharBuffer[nOutLength] = '\0';
2629
2630 #if SIZEOF_WCHAR_T == 4
2631 wxMBConvUTF16 converter;
2632 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2633 delete [] szUniCharBuffer;
2634 #endif
2635
2636 return nOutLength;
2637 }
2638
2639 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2640 {
2641 wxASSERT(szUnConv);
2642
2643 size_t nRealOutSize;
2644 size_t nBufSize = wxWcslen(szUnConv);
2645 UniChar* szUniBuffer = (UniChar*) szUnConv;
2646
2647 #if SIZEOF_WCHAR_T == 4
2648 wxMBConvUTF16 converter ;
2649 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2650 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2651 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2652 nBufSize /= sizeof(UniChar);
2653 #endif
2654
2655 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2656 NULL, //allocator
2657 szUniBuffer,
2658 nBufSize,
2659 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2660 );
2661
2662 wxASSERT(theString);
2663
2664 //Note that CER puts a BOM when converting to unicode
2665 //so we check and use getchars instead in that case
2666 if (m_encoding == kCFStringEncodingUnicode)
2667 {
2668 if (szOut != NULL)
2669 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2670
2671 nRealOutSize = CFStringGetLength(theString) + 1;
2672 }
2673 else
2674 {
2675 CFStringGetBytes(
2676 theString,
2677 CFRangeMake(0, CFStringGetLength(theString)),
2678 m_encoding,
2679 0, //what to put in characters that can't be converted -
2680 //0 tells CFString to return NULL if it meets such a character
2681 false, //not an external representation
2682 (UInt8*) szOut,
2683 nOutSize,
2684 (CFIndex*) &nRealOutSize
2685 );
2686 }
2687
2688 CFRelease(theString);
2689
2690 #if SIZEOF_WCHAR_T == 4
2691 delete[] szUniBuffer;
2692 #endif
2693
2694 return nRealOutSize - 1;
2695 }
2696
2697 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2698
2699 bool IsOk() const
2700 {
2701 return m_encoding != kCFStringEncodingInvalidId &&
2702 CFStringIsEncodingAvailable(m_encoding);
2703 }
2704
2705 private:
2706 CFStringEncoding m_encoding ;
2707 };
2708
2709 #endif // defined(__WXCOCOA__)
2710
2711 // ============================================================================
2712 // Mac conversion classes
2713 // ============================================================================
2714
2715 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2716
2717 class wxMBConv_mac : public wxMBConv
2718 {
2719 public:
2720 wxMBConv_mac()
2721 {
2722 Init(CFStringGetSystemEncoding()) ;
2723 }
2724
2725 wxMBConv_mac(const wxMBConv_mac& conv)
2726 {
2727 Init(conv.m_char_encoding);
2728 }
2729
2730 #if wxUSE_FONTMAP
2731 wxMBConv_mac(const wxChar* name)
2732 {
2733 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2734 }
2735 #endif
2736
2737 wxMBConv_mac(wxFontEncoding encoding)
2738 {
2739 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2740 }
2741
2742 ~wxMBConv_mac()
2743 {
2744 OSStatus status = noErr ;
2745 status = TECDisposeConverter(m_MB2WC_converter);
2746 status = TECDisposeConverter(m_WC2MB_converter);
2747 }
2748
2749
2750 void Init( TextEncodingBase encoding)
2751 {
2752 OSStatus status = noErr ;
2753 m_char_encoding = encoding ;
2754 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2755
2756 status = TECCreateConverter(&m_MB2WC_converter,
2757 m_char_encoding,
2758 m_unicode_encoding);
2759 status = TECCreateConverter(&m_WC2MB_converter,
2760 m_unicode_encoding,
2761 m_char_encoding);
2762 }
2763
2764 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2765 {
2766 OSStatus status = noErr ;
2767 ByteCount byteOutLen ;
2768 ByteCount byteInLen = strlen(psz) + 1;
2769 wchar_t *tbuf = NULL ;
2770 UniChar* ubuf = NULL ;
2771 size_t res = 0 ;
2772
2773 if (buf == NULL)
2774 {
2775 // Apple specs say at least 32
2776 n = wxMax( 32, byteInLen ) ;
2777 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2778 }
2779
2780 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2781
2782 #if SIZEOF_WCHAR_T == 4
2783 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2784 #else
2785 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2786 #endif
2787
2788 status = TECConvertText(
2789 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2790 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2791
2792 #if SIZEOF_WCHAR_T == 4
2793 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2794 // is not properly terminated we get random characters at the end
2795 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2796 wxMBConvUTF16 converter ;
2797 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2798 free( ubuf ) ;
2799 #else
2800 res = byteOutLen / sizeof( UniChar ) ;
2801 #endif
2802
2803 if ( buf == NULL )
2804 free(tbuf) ;
2805
2806 if ( buf && res < n)
2807 buf[res] = 0;
2808
2809 return res ;
2810 }
2811
2812 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2813 {
2814 OSStatus status = noErr ;
2815 ByteCount byteOutLen ;
2816 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2817
2818 char *tbuf = NULL ;
2819
2820 if (buf == NULL)
2821 {
2822 // Apple specs say at least 32
2823 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2824 tbuf = (char*) malloc( n ) ;
2825 }
2826
2827 ByteCount byteBufferLen = n ;
2828 UniChar* ubuf = NULL ;
2829
2830 #if SIZEOF_WCHAR_T == 4
2831 wxMBConvUTF16 converter ;
2832 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2833 byteInLen = unicharlen ;
2834 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2835 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2836 #else
2837 ubuf = (UniChar*) psz ;
2838 #endif
2839
2840 status = TECConvertText(
2841 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2842 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2843
2844 #if SIZEOF_WCHAR_T == 4
2845 free( ubuf ) ;
2846 #endif
2847
2848 if ( buf == NULL )
2849 free(tbuf) ;
2850
2851 size_t res = byteOutLen ;
2852 if ( buf && res < n)
2853 {
2854 buf[res] = 0;
2855
2856 //we need to double-trip to verify it didn't insert any ? in place
2857 //of bogus characters
2858 wxWCharBuffer wcBuf(n);
2859 size_t pszlen = wxWcslen(psz);
2860 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2861 wxWcslen(wcBuf) != pszlen ||
2862 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2863 {
2864 // we didn't obtain the same thing we started from, hence
2865 // the conversion was lossy and we consider that it failed
2866 return wxCONV_FAILED;
2867 }
2868 }
2869
2870 return res ;
2871 }
2872
2873 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2874
2875 bool IsOk() const
2876 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2877
2878 private:
2879 TECObjectRef m_MB2WC_converter;
2880 TECObjectRef m_WC2MB_converter;
2881
2882 TextEncodingBase m_char_encoding;
2883 TextEncodingBase m_unicode_encoding;
2884 };
2885
2886 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2887
2888 // ============================================================================
2889 // wxEncodingConverter based conversion classes
2890 // ============================================================================
2891
2892 #if wxUSE_FONTMAP
2893
2894 class wxMBConv_wxwin : public wxMBConv
2895 {
2896 private:
2897 void Init()
2898 {
2899 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2900 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2901 }
2902
2903 public:
2904 // temporarily just use wxEncodingConverter stuff,
2905 // so that it works while a better implementation is built
2906 wxMBConv_wxwin(const wxChar* name)
2907 {
2908 if (name)
2909 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2910 else
2911 m_enc = wxFONTENCODING_SYSTEM;
2912
2913 Init();
2914 }
2915
2916 wxMBConv_wxwin(wxFontEncoding enc)
2917 {
2918 m_enc = enc;
2919
2920 Init();
2921 }
2922
2923 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2924 {
2925 size_t inbuf = strlen(psz);
2926 if (buf)
2927 {
2928 if (!m2w.Convert(psz, buf))
2929 return wxCONV_FAILED;
2930 }
2931 return inbuf;
2932 }
2933
2934 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2935 {
2936 const size_t inbuf = wxWcslen(psz);
2937 if (buf)
2938 {
2939 if (!w2m.Convert(psz, buf))
2940 return wxCONV_FAILED;
2941 }
2942
2943 return inbuf;
2944 }
2945
2946 virtual size_t GetMBNulLen() const
2947 {
2948 switch ( m_enc )
2949 {
2950 case wxFONTENCODING_UTF16BE:
2951 case wxFONTENCODING_UTF16LE:
2952 return 2;
2953
2954 case wxFONTENCODING_UTF32BE:
2955 case wxFONTENCODING_UTF32LE:
2956 return 4;
2957
2958 default:
2959 return 1;
2960 }
2961 }
2962
2963 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2964
2965 bool IsOk() const { return m_ok; }
2966
2967 public:
2968 wxFontEncoding m_enc;
2969 wxEncodingConverter m2w, w2m;
2970
2971 private:
2972 // were we initialized successfully?
2973 bool m_ok;
2974
2975 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2976 };
2977
2978 // make the constructors available for unit testing
2979 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2980 {
2981 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2982 if ( !result->IsOk() )
2983 {
2984 delete result;
2985 return 0;
2986 }
2987
2988 return result;
2989 }
2990
2991 #endif // wxUSE_FONTMAP
2992
2993 // ============================================================================
2994 // wxCSConv implementation
2995 // ============================================================================
2996
2997 void wxCSConv::Init()
2998 {
2999 m_name = NULL;
3000 m_convReal = NULL;
3001 m_deferred = true;
3002 }
3003
3004 wxCSConv::wxCSConv(const wxChar *charset)
3005 {
3006 Init();
3007
3008 if ( charset )
3009 {
3010 SetName(charset);
3011 }
3012
3013 #if wxUSE_FONTMAP
3014 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3015 #else
3016 m_encoding = wxFONTENCODING_SYSTEM;
3017 #endif
3018 }
3019
3020 wxCSConv::wxCSConv(wxFontEncoding encoding)
3021 {
3022 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3023 {
3024 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3025
3026 encoding = wxFONTENCODING_SYSTEM;
3027 }
3028
3029 Init();
3030
3031 m_encoding = encoding;
3032 }
3033
3034 wxCSConv::~wxCSConv()
3035 {
3036 Clear();
3037 }
3038
3039 wxCSConv::wxCSConv(const wxCSConv& conv)
3040 : wxMBConv()
3041 {
3042 Init();
3043
3044 SetName(conv.m_name);
3045 m_encoding = conv.m_encoding;
3046 }
3047
3048 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3049 {
3050 Clear();
3051
3052 SetName(conv.m_name);
3053 m_encoding = conv.m_encoding;
3054
3055 return *this;
3056 }
3057
3058 void wxCSConv::Clear()
3059 {
3060 free(m_name);
3061 delete m_convReal;
3062
3063 m_name = NULL;
3064 m_convReal = NULL;
3065 }
3066
3067 void wxCSConv::SetName(const wxChar *charset)
3068 {
3069 if (charset)
3070 {
3071 m_name = wxStrdup(charset);
3072 m_deferred = true;
3073 }
3074 }
3075
#if wxUSE_FONTMAP

// hash map from a font encoding to a charset name
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

// remembers, for each encoding already tried in wxCSConv::DoCreate(), the
// name which worked with iconv or an empty string if none of them did
static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
3083
3084 wxMBConv *wxCSConv::DoCreate() const
3085 {
3086 #if wxUSE_FONTMAP
3087 wxLogTrace(TRACE_STRCONV,
3088 wxT("creating conversion for %s"),
3089 (m_name ? m_name
3090 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3091 #endif // wxUSE_FONTMAP
3092
3093 // check for the special case of ASCII or ISO8859-1 charset: as we have
3094 // special knowledge of it anyhow, we don't need to create a special
3095 // conversion object
3096 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3097 m_encoding == wxFONTENCODING_DEFAULT )
3098 {
3099 // don't convert at all
3100 return NULL;
3101 }
3102
3103 // we trust OS to do conversion better than we can so try external
3104 // conversion methods first
3105 //
3106 // the full order is:
3107 // 1. OS conversion (iconv() under Unix or Win32 API)
3108 // 2. hard coded conversions for UTF
3109 // 3. wxEncodingConverter as fall back
3110
3111 // step (1)
3112 #ifdef HAVE_ICONV
3113 #if !wxUSE_FONTMAP
3114 if ( m_name )
3115 #endif // !wxUSE_FONTMAP
3116 {
3117 wxString name(m_name);
3118 wxFontEncoding encoding(m_encoding);
3119
3120 if ( !name.empty() )
3121 {
3122 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3123 if ( conv->IsOk() )
3124 return conv;
3125
3126 delete conv;
3127
3128 #if wxUSE_FONTMAP
3129 encoding =
3130 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3131 #endif // wxUSE_FONTMAP
3132 }
3133 #if wxUSE_FONTMAP
3134 {
3135 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3136 if ( it != gs_nameCache.end() )
3137 {
3138 if ( it->second.empty() )
3139 return NULL;
3140
3141 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3142 if ( conv->IsOk() )
3143 return conv;
3144
3145 delete conv;
3146 }
3147
3148 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3149
3150 for ( ; *names; ++names )
3151 {
3152 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3153 if ( conv->IsOk() )
3154 {
3155 gs_nameCache[encoding] = *names;
3156 return conv;
3157 }
3158
3159 delete conv;
3160 }
3161
3162 gs_nameCache[encoding] = _T(""); // cache the failure
3163 }
3164 #endif // wxUSE_FONTMAP
3165 }
3166 #endif // HAVE_ICONV
3167
3168 #ifdef wxHAVE_WIN32_MB2WC
3169 {
3170 #if wxUSE_FONTMAP
3171 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3172 : new wxMBConv_win32(m_encoding);
3173 if ( conv->IsOk() )
3174 return conv;
3175
3176 delete conv;
3177 #else
3178 return NULL;
3179 #endif
3180 }
3181 #endif // wxHAVE_WIN32_MB2WC
3182
3183 #if defined(__WXMAC__)
3184 {
3185 // leave UTF16 and UTF32 to the built-ins of wx
3186 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3187 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3188 {
3189 #if wxUSE_FONTMAP
3190 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3191 : new wxMBConv_mac(m_encoding);
3192 #else
3193 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3194 #endif
3195 if ( conv->IsOk() )
3196 return conv;
3197
3198 delete conv;
3199 }
3200 }
3201 #endif
3202
3203 #if defined(__WXCOCOA__)
3204 {
3205 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3206 {
3207 #if wxUSE_FONTMAP
3208 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3209 : new wxMBConv_cocoa(m_encoding);
3210 #else
3211 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3212 #endif
3213
3214 if ( conv->IsOk() )
3215 return conv;
3216
3217 delete conv;
3218 }
3219 }
3220 #endif
3221 // step (2)
3222 wxFontEncoding enc = m_encoding;
3223 #if wxUSE_FONTMAP
3224 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3225 {
3226 // use "false" to suppress interactive dialogs -- we can be called from
3227 // anywhere and popping up a dialog from here is the last thing we want to
3228 // do
3229 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3230 }
3231 #endif // wxUSE_FONTMAP
3232
3233 switch ( enc )
3234 {
3235 case wxFONTENCODING_UTF7:
3236 return new wxMBConvUTF7;
3237
3238 case wxFONTENCODING_UTF8:
3239 return new wxMBConvUTF8;
3240
3241 case wxFONTENCODING_UTF16BE:
3242 return new wxMBConvUTF16BE;
3243
3244 case wxFONTENCODING_UTF16LE:
3245 return new wxMBConvUTF16LE;
3246
3247 case wxFONTENCODING_UTF32BE:
3248 return new wxMBConvUTF32BE;
3249
3250 case wxFONTENCODING_UTF32LE:
3251 return new wxMBConvUTF32LE;
3252
3253 default:
3254 // nothing to do but put here to suppress gcc warnings
3255 break;
3256 }
3257
3258 // step (3)
3259 #if wxUSE_FONTMAP
3260 {
3261 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3262 : new wxMBConv_wxwin(m_encoding);
3263 if ( conv->IsOk() )
3264 return conv;
3265
3266 delete conv;
3267 }
3268 #endif // wxUSE_FONTMAP
3269
3270 // NB: This is a hack to prevent deadlock. What could otherwise happen
3271 // in Unicode build: wxConvLocal creation ends up being here
3272 // because of some failure and logs the error. But wxLog will try to
3273 // attach timestamp, for which it will need wxConvLocal (to convert
3274 // time to char* and then wchar_t*), but that fails, tries to log
3275 // error, but wxLog has a (already locked) critical section that
3276 // guards static buffer.
3277 static bool alreadyLoggingError = false;
3278 if (!alreadyLoggingError)
3279 {
3280 alreadyLoggingError = true;
3281 wxLogError(_("Cannot convert from the charset '%s'!"),
3282 m_name ? m_name
3283 :
3284 #if wxUSE_FONTMAP
3285 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3286 #else // !wxUSE_FONTMAP
3287 wxString::Format(_("encoding %s"), m_encoding).c_str()
3288 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3289 );
3290
3291 alreadyLoggingError = false;
3292 }
3293
3294 return NULL;
3295 }
3296
3297 void wxCSConv::CreateConvIfNeeded() const
3298 {
3299 if ( m_deferred )
3300 {
3301 wxCSConv *self = (wxCSConv *)this; // const_cast
3302
3303 #if wxUSE_INTL
3304 // if we don't have neither the name nor the encoding, use the default
3305 // encoding for this system
3306 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3307 {
3308 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3309 }
3310 #endif // wxUSE_INTL
3311
3312 self->m_convReal = DoCreate();
3313 self->m_deferred = false;
3314 }
3315 }
3316
3317 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3318 {
3319 CreateConvIfNeeded();
3320
3321 if (m_convReal)
3322 return m_convReal->MB2WC(buf, psz, n);
3323
3324 // latin-1 (direct)
3325 size_t len = strlen(psz);
3326
3327 if (buf)
3328 {
3329 for (size_t c = 0; c <= len; c++)
3330 buf[c] = (unsigned char)(psz[c]);
3331 }
3332
3333 return len;
3334 }
3335
3336 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3337 {
3338 CreateConvIfNeeded();
3339
3340 if (m_convReal)
3341 return m_convReal->WC2MB(buf, psz, n);
3342
3343 // latin-1 (direct)
3344 const size_t len = wxWcslen(psz);
3345 if (buf)
3346 {
3347 for (size_t c = 0; c <= len; c++)
3348 {
3349 if (psz[c] > 0xFF)
3350 return wxCONV_FAILED;
3351
3352 buf[c] = (char)psz[c];
3353 }
3354 }
3355 else
3356 {
3357 for (size_t c = 0; c <= len; c++)
3358 {
3359 if (psz[c] > 0xFF)
3360 return wxCONV_FAILED;
3361 }
3362 }
3363
3364 return len;
3365 }
3366
3367 size_t wxCSConv::GetMBNulLen() const
3368 {
3369 CreateConvIfNeeded();
3370
3371 if ( m_convReal )
3372 {
3373 return m_convReal->GetMBNulLen();
3374 }
3375
3376 return 1;
3377 }
3378
3379 // ----------------------------------------------------------------------------
3380 // globals
3381 // ----------------------------------------------------------------------------
3382
3383 #ifdef __WINDOWS__
3384 static wxMBConv_win32 wxConvLibcObj;
3385 #elif defined(__WXMAC__) && !defined(__MACH__)
3386 static wxMBConv_mac wxConvLibcObj ;
3387 #else
3388 static wxMBConvLibc wxConvLibcObj;
3389 #endif
3390
3391 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3392 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3393 static wxMBConvUTF7 wxConvUTF7Obj;
3394 static wxMBConvUTF8 wxConvUTF8Obj;
3395
3396 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3397 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3398 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3399 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3400 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3401 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3402 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
3403 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3404 #ifdef __WXOSX__
3405 wxConvUTF8Obj;
3406 #else
3407 wxConvLibcObj;
3408 #endif
3409
3410 #else // !wxUSE_WCHAR_T
3411
3412 // stand-ins in absence of wchar_t
3413 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3414 wxConvISO8859_1,
3415 wxConvLocal,
3416 wxConvUTF8;
3417
3418 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T