explicitly call base class ctor to silence gcc warning (patch 1492701)
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifndef WX_PRECOMP
19 #include "wx/intl.h"
20 #include "wx/log.h"
21 #include "wx/utils.h"
22 #endif
23
24 #include "wx/strconv.h"
25
26 #if wxUSE_WCHAR_T
27
28 #ifdef __WINDOWS__
29 #include "wx/msw/private.h"
30 #include "wx/msw/missing.h"
31 #endif
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #define wxHAVE_WIN32_MB2WC
43 #endif
44
45 #ifdef __SALFORDC__
46 #include <clib.h>
47 #endif
48
49 #ifdef HAVE_ICONV
50 #include <iconv.h>
51 #include "wx/thread.h"
52 #endif
53
54 #include "wx/encconv.h"
55 #include "wx/fontmap.h"
56
57 #ifdef __WXMAC__
58 #ifndef __DARWIN__
59 #include <ATSUnicode.h>
60 #include <TextCommon.h>
61 #include <TextEncodingConverter.h>
62 #endif
63
64 // includes Mac headers
65 #include "wx/mac/private.h"
66 #endif
67
68
// presumably the trace mask used for wxLogTrace() output of this file; it is
// not referenced in the part of the file reviewed here -- TODO confirm
#define TRACE_STRCONV _T("strconv")

// WC_UTF16 is defined only if sizeof(wchar_t) == 2, in which case the code
// below treats wide strings as UTF-16 (surrogates may occur); otherwise
// wchar_t is supposed to be 4 bytes and wide strings are treated as UTF-32
#if SIZEOF_WCHAR_T == 2
    #define WC_UTF16
#endif
76
77
78 // ============================================================================
79 // implementation
80 // ============================================================================
81
// helper of the conversion functions below: returns true if at least one of
// the n bytes starting at p is not NUL and false if all of them are NUL (which
// is trivially the case when n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; i++ )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
90
91 // ----------------------------------------------------------------------------
92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
93 // ----------------------------------------------------------------------------
94
95 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
96 {
97 if (input <= 0xffff)
98 {
99 if (output)
100 *output = (wxUint16) input;
101
102 return 1;
103 }
104 else if (input >= 0x110000)
105 {
106 return wxCONV_FAILED;
107 }
108 else
109 {
110 if (output)
111 {
112 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
113 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
114 }
115
116 return 2;
117 }
118 }
119
120 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
121 {
122 if ((*input < 0xd800) || (*input > 0xdfff))
123 {
124 output = *input;
125 return 1;
126 }
127 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
128 {
129 output = *input;
130 return wxCONV_FAILED;
131 }
132 else
133 {
134 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
135 return 2;
136 }
137 }
138
139 #ifdef WC_UTF16
140 typedef wchar_t wxDecodeSurrogate_t;
141 #else // !WC_UTF16
142 typedef wxUint16 wxDecodeSurrogate_t;
143 #endif // WC_UTF16/!WC_UTF16
144
145 // returns the next UTF-32 character from the wchar_t buffer and advances the
146 // pointer to the character after this one
147 //
148 // if an invalid character is found, *pSrc is set to NULL, the caller must
149 // check for this
150 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
151 {
152 wxUint32 out;
153 const size_t
154 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
155 if ( n == wxCONV_FAILED )
156 *pSrc = NULL;
157 else
158 *pSrc += n;
159
160 return out;
161 }
162
163 // ----------------------------------------------------------------------------
164 // wxMBConv
165 // ----------------------------------------------------------------------------
166
// Default implementation of the multibyte to wide char conversion.
//
// src is either NUL-terminated (srcLen == wxNO_LEN) or a buffer of srcLen
// bytes which may contain several NUL-separated chunks. Returns the number of
// wide characters, including the terminating L'\0' of each chunk, written to
// dst or which would be written to it if dst is NULL. Returns wxCONV_FAILED if
// the conversion fails or if the result doesn't fit into dstLen characters.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid to have to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // convert the input chunk by chunk, each chunk ending at a terminator
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
273
274 size_t
275 wxMBConv::FromWChar(char *dst, size_t dstLen,
276 const wchar_t *src, size_t srcLen) const
277 {
278 // the number of chars [which would be] written to dst [if it were not NULL]
279 size_t dstWritten = 0;
280
281 // make a copy of the input string unless it is already properly
282 // NUL-terminated
283 //
284 // if we don't know its length we have no choice but to assume that it is,
285 // indeed, properly terminated
286 wxWCharBuffer bufTmp;
287 if ( srcLen == wxNO_LEN )
288 {
289 srcLen = wxWcslen(src) + 1;
290 }
291 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
292 {
293 // make a copy in order to properly NUL-terminate the string
294 bufTmp = wxWCharBuffer(srcLen);
295 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
296 src = bufTmp;
297 }
298
299 const size_t lenNul = GetMBNulLen();
300 for ( const wchar_t * const srcEnd = src + srcLen;
301 src < srcEnd;
302 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
303 {
304 // try to convert the current chunk
305 size_t lenChunk = WC2MB(NULL, src, 0);
306
307 if ( lenChunk == wxCONV_FAILED )
308 return wxCONV_FAILED;
309
310 lenChunk += lenNul;
311 dstWritten += lenChunk;
312
313 if ( dst )
314 {
315 if ( dstWritten > dstLen )
316 return wxCONV_FAILED;
317
318 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
319 return wxCONV_FAILED;
320
321 dst += lenChunk;
322 }
323 }
324
325 return dstWritten;
326 }
327
328 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
329 {
330 size_t rc = ToWChar(outBuff, outLen, inBuff);
331 if ( rc != wxCONV_FAILED )
332 {
333 // ToWChar() returns the buffer length, i.e. including the trailing
334 // NUL, while this method doesn't take it into account
335 rc--;
336 }
337
338 return rc;
339 }
340
341 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
342 {
343 size_t rc = FromWChar(outBuff, outLen, inBuff);
344 if ( rc != wxCONV_FAILED )
345 {
346 rc -= GetMBNulLen();
347 }
348
349 return rc;
350 }
351
// Out-of-line definition of the destructor, which has nothing to clean up.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
356
357 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
358 {
359 if ( psz )
360 {
361 // calculate the length of the buffer needed first
362 const size_t nLen = MB2WC(NULL, psz, 0);
363 if ( nLen != wxCONV_FAILED )
364 {
365 // now do the actual conversion
366 wxWCharBuffer buf(nLen /* +1 added implicitly */);
367
368 // +1 for the trailing NULL
369 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
370 return buf;
371 }
372 }
373
374 return wxWCharBuffer();
375 }
376
377 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
378 {
379 if ( pwz )
380 {
381 const size_t nLen = WC2MB(NULL, pwz, 0);
382 if ( nLen != wxCONV_FAILED )
383 {
384 // extra space for trailing NUL(s)
385 static const size_t extraLen = GetMaxMBNulLen();
386
387 wxCharBuffer buf(nLen + extraLen - 1);
388 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
389 return buf;
390 }
391 }
392
393 return wxCharBuffer();
394 }
395
396 const wxWCharBuffer
397 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
398 {
399 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
400 if ( dstLen != wxCONV_FAILED )
401 {
402 wxWCharBuffer wbuf(dstLen - 1);
403 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
404 {
405 if ( outLen )
406 {
407 *outLen = dstLen;
408 if ( wbuf[dstLen - 1] == L'\0' )
409 (*outLen)--;
410 }
411
412 return wbuf;
413 }
414 }
415
416 if ( outLen )
417 *outLen = 0;
418
419 return wxWCharBuffer();
420 }
421
422 const wxCharBuffer
423 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
424 {
425 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
426 if ( dstLen != wxCONV_FAILED )
427 {
428 // special case of empty input: can't allocate 0 size buffer below as
429 // wxCharBuffer insists on NUL-terminating it
430 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
431 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
432 {
433 if ( outLen )
434 {
435 *outLen = dstLen;
436
437 const size_t nulLen = GetMBNulLen();
438 if ( dstLen >= nulLen &&
439 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
440 {
441 // in this case the output is NUL-terminated and we're not
442 // supposed to count NUL
443 *outLen -= nulLen;
444 }
445 }
446
447 return buf;
448 }
449 }
450
451 if ( outLen )
452 *outLen = 0;
453
454 return wxCharBuffer();
455 }
456
457 // ----------------------------------------------------------------------------
458 // wxMBConvLibc
459 // ----------------------------------------------------------------------------
460
461 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
462 {
463 return wxMB2WC(buf, psz, n);
464 }
465
466 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
467 {
468 return wxWC2MB(buf, psz, n);
469 }
470
471 // ----------------------------------------------------------------------------
472 // wxConvBrokenFileNames
473 // ----------------------------------------------------------------------------
474
475 #ifdef __UNIX__
476
477 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
478 {
479 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
480 || wxStricmp(charset, _T("UTF8")) == 0 )
481 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
482 else
483 m_conv = new wxCSConv(charset);
484 }
485
486 #endif // __UNIX__
487
488 // ----------------------------------------------------------------------------
489 // UTF-7
490 // ----------------------------------------------------------------------------
491
492 // Implementation (C) 2004 Fredrik Roubert
493
//
// BASE64 decoding table
//
// Indexed by the value of an input byte: gives the 6 bit value of the BASE64
// digit ('A'-'Z' are 0x00-0x19, 'a'-'z' 0x1a-0x33, '0'-'9' 0x34-0x3d, '+' 0x3e
// and '/' 0x3f) or 0xff for the bytes which are not BASE64 digits.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // bytes with the high bit set are never BASE64 digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
532
533 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
534 {
535 size_t len = 0;
536
537 while ( *psz && (!buf || (len < n)) )
538 {
539 unsigned char cc = *psz++;
540 if (cc != '+')
541 {
542 // plain ASCII char
543 if (buf)
544 *buf++ = cc;
545 len++;
546 }
547 else if (*psz == '-')
548 {
549 // encoded plus sign
550 if (buf)
551 *buf++ = cc;
552 len++;
553 psz++;
554 }
555 else // start of BASE64 encoded string
556 {
557 bool lsb, ok;
558 unsigned int d, l;
559 for ( ok = lsb = false, d = 0, l = 0;
560 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
561 psz++ )
562 {
563 d <<= 6;
564 d += cc;
565 for (l += 6; l >= 8; lsb = !lsb)
566 {
567 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
568 if (lsb)
569 {
570 if (buf)
571 *buf++ |= c;
572 len ++;
573 }
574 else
575 {
576 if (buf)
577 *buf = (wchar_t)(c << 8);
578 }
579
580 ok = true;
581 }
582 }
583
584 if ( !ok )
585 {
586 // in valid UTF7 we should have valid characters after '+'
587 return wxCONV_FAILED;
588 }
589
590 if (*psz == '-')
591 psz++;
592 }
593 }
594
595 if ( buf && (len < n) )
596 *buf = '\0';
597
598 return len;
599 }
600
//
// BASE64 encoding table
//
// Maps a 6 bit value (0..63) to the corresponding BASE64 digit.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
615
//
// UTF-7 encoding table
//
// Indexed by the ASCII code (0..127) of a character, gives its class:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// wxMBConvUTF7::WC2MB() copies only the characters of class 0 as is, all the
// other ones are BASE64-encoded by it.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
635
// Converts the NUL-terminated wide string psz to UTF-7.
//
// Returns the number of bytes, not counting the trailing NUL, which were
// written to buf or, if buf is NULL, which are needed for the result. The
// result is NUL-terminated only if fewer than n bytes were produced. If
// WC_UTF16 is not defined, wxCONV_FAILED is returned for a character above
// 0xffff found outside of a BASE64 run.
//
// NOTE(review): n is only checked between the runs, the BASE64 loop writes to
// buf without comparing len with n; also a negative wchar_t value would
// result in a negative index into utf7encode[] -- to be confirmed and fixed.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // everything else is written as "+<BASE64>-", with '+' itself
            // becoming just "+-"
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode and l is the number of
                // bits in it which were not output yet
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // feed the high and then the low byte of the character
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // output all complete groups of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // the run ends at the end of the string or at the first
                    // directly encodable character, which is not consumed
                    // here but left for the outer loop
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                // output the remaining bits, if any, as the high bits of a
                // last BASE64 digit
                if (l != 0)
                {
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
709
710 // ----------------------------------------------------------------------------
711 // UTF-8
712 // ----------------------------------------------------------------------------
713
// utf8_max[i] is the greatest value which can be encoded using i+1 bytes in
// UTF-8; the last element is greater than or equal to any 32 bit value and so
// stops the loops searching this table
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: the byte b is mapped to
// wxUnicodePUA + b
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
721
// Converts the NUL-terminated UTF-8 string psz to wide characters.
//
// Returns the number of wide characters, not counting the trailing NUL, which
// were written to buf or, if buf is NULL, which are needed for the result. The
// result is NUL-terminated only if fewer than n characters were produced.
//
// Invalid input is handled according to m_options: its bytes are either
// mapped to the private use area starting at wxUnicodePUA
// (MAP_INVALID_UTF8_TO_PUA), or written as backslash followed by 3 octal
// digits (MAP_INVALID_UTF8_TO_OCTAL, in which case a valid backslash is
// doubled), or make the function return wxCONV_FAILED.
//
// NOTE(review): len is compared with n only once per input character, so the
// characters producing more than one output element (surrogate pairs,
// doubled backslash aside) can apparently be written past n -- to be confirmed.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of this sequence to be able to remap its bytes
        // if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // cnt is now the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                // index in utf8_max of the greatest value which could have
                // been encoded using one byte less
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // remap all the bytes of the invalid sequence consumed so far
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is written only if all its 4 characters
                        // fit but is counted in len in any case
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
865
// returns true if wch is one of the octal digits L'0' to L'7'
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch < L'8';
}
870
// Converts the NUL-terminated wide string psz to UTF-8.
//
// Returns the number of bytes, not counting the trailing NUL, which were
// written to buf or, if buf is NULL, which are needed for the result. The
// result is NUL-terminated only if fewer than n bytes were produced.
//
// This undoes the mappings done by MB2WC(): with MAP_INVALID_UTF8_TO_PUA the
// characters in the [wxUnicodePUA, wxUnicodePUAEnd) range are converted back
// to single bytes and with MAP_INVALID_UTF8_TO_OCTAL a doubled backslash
// becomes a single one and a backslash followed by 3 octal digits becomes the
// byte with this value.
//
// NOTE(review): len is compared with n only once per input character, so a
// multibyte sequence can apparently be written past n -- to be confirmed.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);

        // an invalid surrogate is not an error here: it is skipped and then
        // encoded below just as any other character
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original invalid byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash: output just one of them
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte it stands for
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed for this value
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the lead byte has cnt+1 high bits set followed by the
                    // most significant bits of the value
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // each continuation byte carries the next 6 bits
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
948
949 // ============================================================================
950 // UTF-16
951 // ============================================================================
952
// the conversions below are implemented under the names
// wxMBConvUTF16straight, for the class whose byte order is the same as the
// one of this machine, and wxMBConvUTF16swap for the class using the
// opposite byte order: map them to the real class names here
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
960
961 /* static */
962 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
963 {
964 if ( srcLen == wxNO_LEN )
965 {
966 // count the number of bytes in input, including the trailing NULs
967 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
968 for ( srcLen = 1; *inBuff++; srcLen++ )
969 ;
970
971 srcLen *= BYTES_PER_CHAR;
972 }
973 else // we already have the length
974 {
975 // we can only convert an entire number of UTF-16 characters
976 if ( srcLen % BYTES_PER_CHAR )
977 return wxCONV_FAILED;
978 }
979
980 return srcLen;
981 }
982
983 // case when in-memory representation is UTF-16 too
984 #ifdef WC_UTF16
985
986 // ----------------------------------------------------------------------------
987 // conversions without endianness change
988 // ----------------------------------------------------------------------------
989
990 size_t
991 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
992 const char *src, size_t srcLen) const
993 {
994 // set up the scene for using memcpy() (which is presumably more efficient
995 // than copying the bytes one by one)
996 srcLen = GetLength(src, srcLen);
997 if ( srcLen == wxNO_LEN )
998 return wxCONV_FAILED;
999
1000 const size_t inLen = srcLen / BYTES_PER_CHAR;
1001 if ( dst )
1002 {
1003 if ( dstLen < inLen )
1004 return wxCONV_FAILED;
1005
1006 memcpy(dst, src, srcLen);
1007 }
1008
1009 return inLen;
1010 }
1011
1012 size_t
1013 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1014 const wchar_t *src, size_t srcLen) const
1015 {
1016 if ( srcLen == wxNO_LEN )
1017 srcLen = wxWcslen(src) + 1;
1018
1019 srcLen *= BYTES_PER_CHAR;
1020
1021 if ( dst )
1022 {
1023 if ( dstLen < srcLen )
1024 return wxCONV_FAILED;
1025
1026 memcpy(dst, src, srcLen);
1027 }
1028
1029 return srcLen;
1030 }
1031
1032 // ----------------------------------------------------------------------------
1033 // endian-reversing conversions
1034 // ----------------------------------------------------------------------------
1035
1036 size_t
1037 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1038 const char *src, size_t srcLen) const
1039 {
1040 srcLen = GetLength(src, srcLen);
1041 if ( srcLen == wxNO_LEN )
1042 return wxCONV_FAILED;
1043
1044 srcLen /= BYTES_PER_CHAR;
1045
1046 if ( dst )
1047 {
1048 if ( dstLen < srcLen )
1049 return wxCONV_FAILED;
1050
1051 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1052 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1053 {
1054 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1055 }
1056 }
1057
1058 return srcLen;
1059 }
1060
1061 size_t
1062 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1063 const wchar_t *src, size_t srcLen) const
1064 {
1065 if ( srcLen == wxNO_LEN )
1066 srcLen = wxWcslen(src) + 1;
1067
1068 srcLen *= BYTES_PER_CHAR;
1069
1070 if ( dst )
1071 {
1072 if ( dstLen < srcLen )
1073 return wxCONV_FAILED;
1074
1075 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1076 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1077 {
1078 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1079 }
1080 }
1081
1082 return srcLen;
1083 }
1084
1085 #else // !WC_UTF16: wchar_t is UTF-32
1086
1087 // ----------------------------------------------------------------------------
1088 // conversions without endianness change
1089 // ----------------------------------------------------------------------------
1090
1091 size_t
1092 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1093 const char *src, size_t srcLen) const
1094 {
1095 srcLen = GetLength(src, srcLen);
1096 if ( srcLen == wxNO_LEN )
1097 return wxCONV_FAILED;
1098
1099 const size_t inLen = srcLen / BYTES_PER_CHAR;
1100 if ( !dst )
1101 {
1102 // optimization: return maximal space which could be needed for this
1103 // string even if the real size could be smaller if the buffer contains
1104 // any surrogates
1105 return inLen;
1106 }
1107
1108 size_t outLen = 0;
1109 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1110 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1111 {
1112 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1113 if ( !inBuff )
1114 return wxCONV_FAILED;
1115
1116 if ( ++outLen > dstLen )
1117 return wxCONV_FAILED;
1118
1119 *dst++ = ch;
1120 }
1121
1122
1123 return outLen;
1124 }
1125
1126 size_t
1127 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1128 const wchar_t *src, size_t srcLen) const
1129 {
1130 if ( srcLen == wxNO_LEN )
1131 srcLen = wxWcslen(src) + 1;
1132
1133 size_t outLen = 0;
1134 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1135 for ( size_t n = 0; n < srcLen; n++ )
1136 {
1137 wxUint16 cc[2];
1138 const size_t numChars = encode_utf16(*src++, cc);
1139 if ( numChars == wxCONV_FAILED )
1140 return wxCONV_FAILED;
1141
1142 outLen += numChars * BYTES_PER_CHAR;
1143 if ( outBuff )
1144 {
1145 if ( outLen > dstLen )
1146 return wxCONV_FAILED;
1147
1148 *outBuff++ = cc[0];
1149 if ( numChars == 2 )
1150 {
1151 // second character of a surrogate
1152 *outBuff++ = cc[1];
1153 }
1154 }
1155 }
1156
1157 return outLen;
1158 }
1159
1160 // ----------------------------------------------------------------------------
1161 // endian-reversing conversions
1162 // ----------------------------------------------------------------------------
1163
1164 size_t
1165 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1166 const char *src, size_t srcLen) const
1167 {
1168 srcLen = GetLength(src, srcLen);
1169 if ( srcLen == wxNO_LEN )
1170 return wxCONV_FAILED;
1171
1172 const size_t inLen = srcLen / BYTES_PER_CHAR;
1173 if ( !dst )
1174 {
1175 // optimization: return maximal space which could be needed for this
1176 // string even if the real size could be smaller if the buffer contains
1177 // any surrogates
1178 return inLen;
1179 }
1180
1181 size_t outLen = 0;
1182 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1183 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1184 {
1185 wxUint32 ch;
1186 wxUint16 tmp[2];
1187
1188 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1189 inBuff++;
1190 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1191
1192 const size_t numChars = decode_utf16(tmp, ch);
1193 if ( numChars == wxCONV_FAILED )
1194 return wxCONV_FAILED;
1195
1196 if ( numChars == 2 )
1197 inBuff++;
1198
1199 if ( ++outLen > dstLen )
1200 return wxCONV_FAILED;
1201
1202 *dst++ = ch;
1203 }
1204
1205
1206 return outLen;
1207 }
1208
1209 size_t
1210 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1211 const wchar_t *src, size_t srcLen) const
1212 {
1213 if ( srcLen == wxNO_LEN )
1214 srcLen = wxWcslen(src) + 1;
1215
1216 size_t outLen = 0;
1217 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1218 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1219 {
1220 wxUint16 cc[2];
1221 const size_t numChars = encode_utf16(*src, cc);
1222 if ( numChars == wxCONV_FAILED )
1223 return wxCONV_FAILED;
1224
1225 outLen += numChars * BYTES_PER_CHAR;
1226 if ( outBuff )
1227 {
1228 if ( outLen > dstLen )
1229 return wxCONV_FAILED;
1230
1231 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1232 if ( numChars == 2 )
1233 {
1234 // second character of a surrogate
1235 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1236 }
1237 }
1238 }
1239
1240 return outLen;
1241 }
1242
1243 #endif // WC_UTF16/!WC_UTF16
1244
1245
1246 // ============================================================================
1247 // UTF-32
1248 // ============================================================================
1249
// as for UTF-16 above, wxMBConvUTF32straight is the class whose byte order is
// the same as the one of this machine and wxMBConvUTF32swap is the class
// using the opposite byte order
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// the global converter objects for the two UTF-32 byte orders
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1261
1262 /* static */
1263 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1264 {
1265 if ( srcLen == wxNO_LEN )
1266 {
1267 // count the number of bytes in input, including the trailing NULs
1268 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1269 for ( srcLen = 1; *inBuff++; srcLen++ )
1270 ;
1271
1272 srcLen *= BYTES_PER_CHAR;
1273 }
1274 else // we already have the length
1275 {
1276 // we can only convert an entire number of UTF-32 characters
1277 if ( srcLen % BYTES_PER_CHAR )
1278 return wxCONV_FAILED;
1279 }
1280
1281 return srcLen;
1282 }
1283
1284 // case when in-memory representation is UTF-16
1285 #ifdef WC_UTF16
1286
1287 // ----------------------------------------------------------------------------
1288 // conversions without endianness change
1289 // ----------------------------------------------------------------------------
1290
1291 size_t
1292 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1293 const char *src, size_t srcLen) const
1294 {
1295 srcLen = GetLength(src, srcLen);
1296 if ( srcLen == wxNO_LEN )
1297 return wxCONV_FAILED;
1298
1299 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1300 const size_t inLen = srcLen / BYTES_PER_CHAR;
1301 size_t outLen = 0;
1302 for ( size_t n = 0; n < inLen; n++ )
1303 {
1304 wxUint16 cc[2];
1305 const size_t numChars = encode_utf16(*inBuff++, cc);
1306 if ( numChars == wxCONV_FAILED )
1307 return wxCONV_FAILED;
1308
1309 outLen += numChars;
1310 if ( dst )
1311 {
1312 if ( outLen > dstLen )
1313 return wxCONV_FAILED;
1314
1315 *dst++ = cc[0];
1316 if ( numChars == 2 )
1317 {
1318 // second character of a surrogate
1319 *dst++ = cc[1];
1320 }
1321 }
1322 }
1323
1324 return outLen;
1325 }
1326
1327 size_t
1328 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1329 const wchar_t *src, size_t srcLen) const
1330 {
1331 if ( srcLen == wxNO_LEN )
1332 srcLen = wxWcslen(src) + 1;
1333
1334 if ( !dst )
1335 {
1336 // optimization: return maximal space which could be needed for this
1337 // string instead of the exact amount which could be less if there are
1338 // any surrogates in the input
1339 //
1340 // we consider that surrogates are rare enough to make it worthwhile to
1341 // avoid running the loop below at the cost of slightly extra memory
1342 // consumption
1343 return srcLen * BYTES_PER_CHAR;
1344 }
1345
1346 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1347 size_t outLen = 0;
1348 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1349 {
1350 const wxUint32 ch = wxDecodeSurrogate(&src);
1351 if ( !src )
1352 return wxCONV_FAILED;
1353
1354 outLen += BYTES_PER_CHAR;
1355
1356 if ( outLen > dstLen )
1357 return wxCONV_FAILED;
1358
1359 *outBuff++ = ch;
1360 }
1361
1362 return outLen;
1363 }
1364
1365 // ----------------------------------------------------------------------------
1366 // endian-reversing conversions
1367 // ----------------------------------------------------------------------------
1368
1369 size_t
1370 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1371 const char *src, size_t srcLen) const
1372 {
1373 srcLen = GetLength(src, srcLen);
1374 if ( srcLen == wxNO_LEN )
1375 return wxCONV_FAILED;
1376
1377 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1378 const size_t inLen = srcLen / BYTES_PER_CHAR;
1379 size_t outLen = 0;
1380 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1381 {
1382 wxUint16 cc[2];
1383 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1384 if ( numChars == wxCONV_FAILED )
1385 return wxCONV_FAILED;
1386
1387 outLen += numChars;
1388 if ( dst )
1389 {
1390 if ( outLen > dstLen )
1391 return wxCONV_FAILED;
1392
1393 *dst++ = cc[0];
1394 if ( numChars == 2 )
1395 {
1396 // second character of a surrogate
1397 *dst++ = cc[1];
1398 }
1399 }
1400 }
1401
1402 return outLen;
1403 }
1404
1405 size_t
1406 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1407 const wchar_t *src, size_t srcLen) const
1408 {
1409 if ( srcLen == wxNO_LEN )
1410 srcLen = wxWcslen(src) + 1;
1411
1412 if ( !dst )
1413 {
1414 // optimization: return maximal space which could be needed for this
1415 // string instead of the exact amount which could be less if there are
1416 // any surrogates in the input
1417 //
1418 // we consider that surrogates are rare enough to make it worthwhile to
1419 // avoid running the loop below at the cost of slightly extra memory
1420 // consumption
1421 return srcLen*BYTES_PER_CHAR;
1422 }
1423
1424 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1425 size_t outLen = 0;
1426 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1427 {
1428 const wxUint32 ch = wxDecodeSurrogate(&src);
1429 if ( !src )
1430 return wxCONV_FAILED;
1431
1432 outLen += BYTES_PER_CHAR;
1433
1434 if ( outLen > dstLen )
1435 return wxCONV_FAILED;
1436
1437 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1438 }
1439
1440 return outLen;
1441 }
1442
1443 #else // !WC_UTF16: wchar_t is UTF-32
1444
1445 // ----------------------------------------------------------------------------
1446 // conversions without endianness change
1447 // ----------------------------------------------------------------------------
1448
1449 size_t
1450 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1451 const char *src, size_t srcLen) const
1452 {
1453 // use memcpy() as it should be much faster than hand-written loop
1454 srcLen = GetLength(src, srcLen);
1455 if ( srcLen == wxNO_LEN )
1456 return wxCONV_FAILED;
1457
1458 const size_t inLen = srcLen/BYTES_PER_CHAR;
1459 if ( dst )
1460 {
1461 if ( dstLen < inLen )
1462 return wxCONV_FAILED;
1463
1464 memcpy(dst, src, srcLen);
1465 }
1466
1467 return inLen;
1468 }
1469
1470 size_t
1471 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1472 const wchar_t *src, size_t srcLen) const
1473 {
1474 if ( srcLen == wxNO_LEN )
1475 srcLen = wxWcslen(src) + 1;
1476
1477 srcLen *= BYTES_PER_CHAR;
1478
1479 if ( dst )
1480 {
1481 if ( dstLen < srcLen )
1482 return wxCONV_FAILED;
1483
1484 memcpy(dst, src, srcLen);
1485 }
1486
1487 return srcLen;
1488 }
1489
1490 // ----------------------------------------------------------------------------
1491 // endian-reversing conversions
1492 // ----------------------------------------------------------------------------
1493
1494 size_t
1495 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1496 const char *src, size_t srcLen) const
1497 {
1498 srcLen = GetLength(src, srcLen);
1499 if ( srcLen == wxNO_LEN )
1500 return wxCONV_FAILED;
1501
1502 srcLen /= BYTES_PER_CHAR;
1503
1504 if ( dst )
1505 {
1506 if ( dstLen < srcLen )
1507 return wxCONV_FAILED;
1508
1509 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1510 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1511 {
1512 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1513 }
1514 }
1515
1516 return srcLen;
1517 }
1518
1519 size_t
1520 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1521 const wchar_t *src, size_t srcLen) const
1522 {
1523 if ( srcLen == wxNO_LEN )
1524 srcLen = wxWcslen(src) + 1;
1525
1526 srcLen *= BYTES_PER_CHAR;
1527
1528 if ( dst )
1529 {
1530 if ( dstLen < srcLen )
1531 return wxCONV_FAILED;
1532
1533 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1534 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1535 {
1536 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1537 }
1538 }
1539
1540 return srcLen;
1541 }
1542
1543 #endif // WC_UTF16/!WC_UTF16
1544
1545
1546 // ============================================================================
1547 // The classes doing conversion using the iconv_xxx() functions
1548 // ============================================================================
1549
1550 #ifdef HAVE_ICONV
1551
1552 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1553 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1554 // (unless there's yet another bug in glibc) the only case when iconv()
1555 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1556 // left in the input buffer -- when _real_ error occurs,
1557 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1558 // iconv() failure.
1559 // [This bug does not appear in glibc 2.2.]
1560 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1561 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1562 (errno != E2BIG || bufLeft != 0))
1563 #else
1564 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1565 #endif
1566
1567 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1568
1569 #define ICONV_T_INVALID ((iconv_t)-1)
1570
1571 #if SIZEOF_WCHAR_T == 4
1572 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1573 #define WC_ENC wxFONTENCODING_UTF32
1574 #elif SIZEOF_WCHAR_T == 2
1575 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1576 #define WC_ENC wxFONTENCODING_UTF16
1577 #else // sizeof(wchar_t) != 2 nor 4
1578 // does this ever happen?
1579 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1580 #endif
1581
1582 // ----------------------------------------------------------------------------
1583 // wxMBConv_iconv: encapsulates an iconv character set
1584 // ----------------------------------------------------------------------------
1585
1586 class wxMBConv_iconv : public wxMBConv
1587 {
1588 public:
1589 wxMBConv_iconv(const wxChar *name);
1590 virtual ~wxMBConv_iconv();
1591
1592 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1593 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1594
1595 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1596 virtual size_t GetMBNulLen() const;
1597
1598 virtual wxMBConv *Clone() const
1599 {
1600 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1601 p->m_minMBCharWidth = m_minMBCharWidth;
1602 return p;
1603 }
1604
1605 bool IsOk() const
1606 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1607
1608 protected:
1609 // the iconv handlers used to translate from multibyte
1610 // to wide char and in the other direction
1611 iconv_t m2w,
1612 w2m;
1613
1614 #if wxUSE_THREADS
1615 // guards access to m2w and w2m objects
1616 wxMutex m_iconvMutex;
1617 #endif
1618
1619 private:
1620 // the name (for iconv_open()) of a wide char charset -- if none is
1621 // available on this machine, it will remain NULL
1622 static wxString ms_wcCharsetName;
1623
1624 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1625 // different endian-ness than the native one
1626 static bool ms_wcNeedsSwap;
1627
1628
1629 // name of the encoding handled by this conversion
1630 wxString m_name;
1631
1632 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1633 // initially
1634 size_t m_minMBCharWidth;
1635 };
1636
1637 // make the constructor available for unit testing
1638 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1639 {
1640 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1641 if ( !result->IsOk() )
1642 {
1643 delete result;
1644 return 0;
1645 }
1646
1647 return result;
1648 }
1649
1650 wxString wxMBConv_iconv::ms_wcCharsetName;
1651 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1652
1653 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1654 : m_name(name)
1655 {
1656 m_minMBCharWidth = 0;
1657
1658 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1659 // names for the charsets
1660 const wxCharBuffer cname(wxString(name).ToAscii());
1661
1662 // check for charset that represents wchar_t:
1663 if ( ms_wcCharsetName.empty() )
1664 {
1665 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1666
1667 #if wxUSE_FONTMAP
1668 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1669 #else // !wxUSE_FONTMAP
1670 static const wxChar *names[] =
1671 {
1672 #if SIZEOF_WCHAR_T == 4
1673 _T("UCS-4"),
1674 #elif SIZEOF_WCHAR_T = 2
1675 _T("UCS-2"),
1676 #endif
1677 NULL
1678 };
1679 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1680
1681 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1682 {
1683 const wxString nameCS(*names);
1684
1685 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1686 wxString nameXE(nameCS);
1687
1688 #ifdef WORDS_BIGENDIAN
1689 nameXE += _T("BE");
1690 #else // little endian
1691 nameXE += _T("LE");
1692 #endif
1693
1694 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1695 nameXE.c_str());
1696
1697 m2w = iconv_open(nameXE.ToAscii(), cname);
1698 if ( m2w == ICONV_T_INVALID )
1699 {
1700 // try charset w/o bytesex info (e.g. "UCS4")
1701 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1702 nameCS.c_str());
1703 m2w = iconv_open(nameCS.ToAscii(), cname);
1704
1705 // and check for bytesex ourselves:
1706 if ( m2w != ICONV_T_INVALID )
1707 {
1708 char buf[2], *bufPtr;
1709 wchar_t wbuf[2], *wbufPtr;
1710 size_t insz, outsz;
1711 size_t res;
1712
1713 buf[0] = 'A';
1714 buf[1] = 0;
1715 wbuf[0] = 0;
1716 insz = 2;
1717 outsz = SIZEOF_WCHAR_T * 2;
1718 wbufPtr = wbuf;
1719 bufPtr = buf;
1720
1721 res = iconv(
1722 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1723 (char**)&wbufPtr, &outsz);
1724
1725 if (ICONV_FAILED(res, insz))
1726 {
1727 wxLogLastError(wxT("iconv"));
1728 wxLogError(_("Conversion to charset '%s' doesn't work."),
1729 nameCS.c_str());
1730 }
1731 else // ok, can convert to this encoding, remember it
1732 {
1733 ms_wcCharsetName = nameCS;
1734 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1735 }
1736 }
1737 }
1738 else // use charset not requiring byte swapping
1739 {
1740 ms_wcCharsetName = nameXE;
1741 }
1742 }
1743
1744 wxLogTrace(TRACE_STRCONV,
1745 wxT("iconv wchar_t charset is \"%s\"%s"),
1746 ms_wcCharsetName.empty() ? _T("<none>")
1747 : ms_wcCharsetName.c_str(),
1748 ms_wcNeedsSwap ? _T(" (needs swap)")
1749 : _T(""));
1750 }
1751 else // we already have ms_wcCharsetName
1752 {
1753 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1754 }
1755
1756 if ( ms_wcCharsetName.empty() )
1757 {
1758 w2m = ICONV_T_INVALID;
1759 }
1760 else
1761 {
1762 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1763 if ( w2m == ICONV_T_INVALID )
1764 {
1765 wxLogTrace(TRACE_STRCONV,
1766 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1767 ms_wcCharsetName.c_str(), cname.data());
1768 }
1769 }
1770 }
1771
1772 wxMBConv_iconv::~wxMBConv_iconv()
1773 {
1774 if ( m2w != ICONV_T_INVALID )
1775 iconv_close(m2w);
1776 if ( w2m != ICONV_T_INVALID )
1777 iconv_close(w2m);
1778 }
1779
1780 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1781 {
1782 // find the string length: notice that must be done differently for
1783 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1784 size_t inbuf;
1785 const size_t nulLen = GetMBNulLen();
1786 switch ( nulLen )
1787 {
1788 default:
1789 return wxCONV_FAILED;
1790
1791 case 1:
1792 inbuf = strlen(psz); // arguably more optimized than our version
1793 break;
1794
1795 case 2:
1796 case 4:
1797 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1798 // they also have to start at character boundary and not span two
1799 // adjacent characters
1800 const char *p;
1801 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1802 ;
1803 inbuf = p - psz;
1804 break;
1805 }
1806
1807 #if wxUSE_THREADS
1808 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1809 // Unfortunately there is a couple of global wxCSConv objects such as
1810 // wxConvLocal that are used all over wx code, so we have to make sure
1811 // the handle is used by at most one thread at the time. Otherwise
1812 // only a few wx classes would be safe to use from non-main threads
1813 // as MB<->WC conversion would fail "randomly".
1814 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1815 #endif // wxUSE_THREADS
1816
1817 size_t outbuf = n * SIZEOF_WCHAR_T;
1818 size_t res, cres;
1819 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1820 wchar_t *bufPtr = buf;
1821 const char *pszPtr = psz;
1822
1823 if (buf)
1824 {
1825 // have destination buffer, convert there
1826 cres = iconv(m2w,
1827 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1828 (char**)&bufPtr, &outbuf);
1829 res = n - (outbuf / SIZEOF_WCHAR_T);
1830
1831 if (ms_wcNeedsSwap)
1832 {
1833 // convert to native endianness
1834 for ( unsigned i = 0; i < res; i++ )
1835 buf[n] = WC_BSWAP(buf[i]);
1836 }
1837
1838 // NUL-terminate the string if there is any space left
1839 if (res < n)
1840 buf[res] = 0;
1841 }
1842 else
1843 {
1844 // no destination buffer... convert using temp buffer
1845 // to calculate destination buffer requirement
1846 wchar_t tbuf[8];
1847 res = 0;
1848
1849 do
1850 {
1851 bufPtr = tbuf;
1852 outbuf = 8 * SIZEOF_WCHAR_T;
1853
1854 cres = iconv(m2w,
1855 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1856 (char**)&bufPtr, &outbuf );
1857
1858 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1859 }
1860 while ((cres == (size_t)-1) && (errno == E2BIG));
1861 }
1862
1863 if (ICONV_FAILED(cres, inbuf))
1864 {
1865 //VS: it is ok if iconv fails, hence trace only
1866 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1867 return wxCONV_FAILED;
1868 }
1869
1870 return res;
1871 }
1872
1873 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1874 {
1875 #if wxUSE_THREADS
1876 // NB: explained in MB2WC
1877 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1878 #endif
1879
1880 size_t inlen = wxWcslen(psz);
1881 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1882 size_t outbuf = n;
1883 size_t res, cres;
1884
1885 wchar_t *tmpbuf = 0;
1886
1887 if (ms_wcNeedsSwap)
1888 {
1889 // need to copy to temp buffer to switch endianness
1890 // (doing WC_BSWAP twice on the original buffer won't help, as it
1891 // could be in read-only memory, or be accessed in some other thread)
1892 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1893 for ( size_t i = 0; i < inlen; i++ )
1894 tmpbuf[n] = WC_BSWAP(psz[i]);
1895
1896 tmpbuf[inlen] = L'\0';
1897 psz = tmpbuf;
1898 }
1899
1900 if (buf)
1901 {
1902 // have destination buffer, convert there
1903 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1904
1905 res = n - outbuf;
1906
1907 // NB: iconv was given only wcslen(psz) characters on input, and so
1908 // it couldn't convert the trailing zero. Let's do it ourselves
1909 // if there's some room left for it in the output buffer.
1910 if (res < n)
1911 buf[0] = 0;
1912 }
1913 else
1914 {
1915 // no destination buffer: convert using temp buffer
1916 // to calculate destination buffer requirement
1917 char tbuf[16];
1918 res = 0;
1919 do
1920 {
1921 buf = tbuf;
1922 outbuf = 16;
1923
1924 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1925
1926 res += 16 - outbuf;
1927 }
1928 while ((cres == (size_t)-1) && (errno == E2BIG));
1929 }
1930
1931 if (ms_wcNeedsSwap)
1932 {
1933 free(tmpbuf);
1934 }
1935
1936 if (ICONV_FAILED(cres, inbuf))
1937 {
1938 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1939 return wxCONV_FAILED;
1940 }
1941
1942 return res;
1943 }
1944
1945 size_t wxMBConv_iconv::GetMBNulLen() const
1946 {
1947 if ( m_minMBCharWidth == 0 )
1948 {
1949 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1950
1951 #if wxUSE_THREADS
1952 // NB: explained in MB2WC
1953 wxMutexLocker lock(self->m_iconvMutex);
1954 #endif
1955
1956 wchar_t *wnul = L"";
1957 char buf[8]; // should be enough for NUL in any encoding
1958 size_t inLen = sizeof(wchar_t),
1959 outLen = WXSIZEOF(buf);
1960 char *inBuff = (char *)wnul;
1961 char *outBuff = buf;
1962 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1963 {
1964 self->m_minMBCharWidth = (size_t)-1;
1965 }
1966 else // ok
1967 {
1968 self->m_minMBCharWidth = outBuff - buf;
1969 }
1970 }
1971
1972 return m_minMBCharWidth;
1973 }
1974
1975 #endif // HAVE_ICONV
1976
1977
1978 // ============================================================================
1979 // Win32 conversion classes
1980 // ============================================================================
1981
1982 #ifdef wxHAVE_WIN32_MB2WC
1983
1984 // from utils.cpp
1985 #if wxUSE_FONTMAP
1986 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1987 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1988 #endif
1989
1990 class wxMBConv_win32 : public wxMBConv
1991 {
1992 public:
1993 wxMBConv_win32()
1994 {
1995 m_CodePage = CP_ACP;
1996 m_minMBCharWidth = 0;
1997 }
1998
1999 wxMBConv_win32(const wxMBConv_win32& conv)
2000 : wxMBConv()
2001 {
2002 m_CodePage = conv.m_CodePage;
2003 m_minMBCharWidth = conv.m_minMBCharWidth;
2004 }
2005
2006 #if wxUSE_FONTMAP
2007 wxMBConv_win32(const wxChar* name)
2008 {
2009 m_CodePage = wxCharsetToCodepage(name);
2010 m_minMBCharWidth = 0;
2011 }
2012
2013 wxMBConv_win32(wxFontEncoding encoding)
2014 {
2015 m_CodePage = wxEncodingToCodepage(encoding);
2016 m_minMBCharWidth = 0;
2017 }
2018 #endif // wxUSE_FONTMAP
2019
2020 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2021 {
2022 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2023 // the behaviour is not compatible with the Unix version (using iconv)
2024 // and break the library itself, e.g. wxTextInputStream::NextChar()
2025 // wouldn't work if reading an incomplete MB char didn't result in an
2026 // error
2027 //
2028 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2029 // Win XP or newer and it is not supported for UTF-[78] so we always
2030 // use our own conversions in this case. See
2031 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2032 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2033 if ( m_CodePage == CP_UTF8 )
2034 {
2035 return wxConvUTF8.MB2WC(buf, psz, n);
2036 }
2037
2038 if ( m_CodePage == CP_UTF7 )
2039 {
2040 return wxConvUTF7.MB2WC(buf, psz, n);
2041 }
2042
2043 int flags = 0;
2044 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2045 IsAtLeastWin2kSP4() )
2046 {
2047 flags = MB_ERR_INVALID_CHARS;
2048 }
2049
2050 const size_t len = ::MultiByteToWideChar
2051 (
2052 m_CodePage, // code page
2053 flags, // flags: fall on error
2054 psz, // input string
2055 -1, // its length (NUL-terminated)
2056 buf, // output string
2057 buf ? n : 0 // size of output buffer
2058 );
2059 if ( !len )
2060 {
2061 // function totally failed
2062 return wxCONV_FAILED;
2063 }
2064
2065 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2066 // check if we succeeded, by doing a double trip:
2067 if ( !flags && buf )
2068 {
2069 const size_t mbLen = strlen(psz);
2070 wxCharBuffer mbBuf(mbLen);
2071 if ( ::WideCharToMultiByte
2072 (
2073 m_CodePage,
2074 0,
2075 buf,
2076 -1,
2077 mbBuf.data(),
2078 mbLen + 1, // size in bytes, not length
2079 NULL,
2080 NULL
2081 ) == 0 ||
2082 strcmp(mbBuf, psz) != 0 )
2083 {
2084 // we didn't obtain the same thing we started from, hence
2085 // the conversion was lossy and we consider that it failed
2086 return wxCONV_FAILED;
2087 }
2088 }
2089
2090 // note that it returns count of written chars for buf != NULL and size
2091 // of the needed buffer for buf == NULL so in either case the length of
2092 // the string (which never includes the terminating NUL) is one less
2093 return len - 1;
2094 }
2095
2096 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2097 {
2098 /*
2099 we have a problem here: by default, WideCharToMultiByte() may
2100 replace characters unrepresentable in the target code page with bad
2101 quality approximations such as turning "1/2" symbol (U+00BD) into
2102 "1" for the code pages which don't have it and we, obviously, want
2103 to avoid this at any price
2104
2105 the trouble is that this function does it _silently_, i.e. it won't
2106 even tell us whether it did or not... Win98/2000 and higher provide
2107 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2108 we have to resort to a round trip, i.e. check that converting back
2109 results in the same string -- this is, of course, expensive but
2110 otherwise we simply can't be sure to not garble the data.
2111 */
2112
2113 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2114 // it doesn't work with CJK encodings (which we test for rather roughly
2115 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2116 // supporting it
2117 BOOL usedDef wxDUMMY_INITIALIZE(false);
2118 BOOL *pUsedDef;
2119 int flags;
2120 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2121 {
2122 // it's our lucky day
2123 flags = WC_NO_BEST_FIT_CHARS;
2124 pUsedDef = &usedDef;
2125 }
2126 else // old system or unsupported encoding
2127 {
2128 flags = 0;
2129 pUsedDef = NULL;
2130 }
2131
2132 const size_t len = ::WideCharToMultiByte
2133 (
2134 m_CodePage, // code page
2135 flags, // either none or no best fit
2136 pwz, // input string
2137 -1, // it is (wide) NUL-terminated
2138 buf, // output buffer
2139 buf ? n : 0, // and its size
2140 NULL, // default "replacement" char
2141 pUsedDef // [out] was it used?
2142 );
2143
2144 if ( !len )
2145 {
2146 // function totally failed
2147 return wxCONV_FAILED;
2148 }
2149
2150 // if we were really converting, check if we succeeded
2151 if ( buf )
2152 {
2153 if ( flags )
2154 {
2155 // check if the conversion failed, i.e. if any replacements
2156 // were done
2157 if ( usedDef )
2158 return wxCONV_FAILED;
2159 }
2160 else // we must resort to double tripping...
2161 {
2162 wxWCharBuffer wcBuf(n);
2163 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2164 wcscmp(wcBuf, pwz) != 0 )
2165 {
2166 // we didn't obtain the same thing we started from, hence
2167 // the conversion was lossy and we consider that it failed
2168 return wxCONV_FAILED;
2169 }
2170 }
2171 }
2172
2173 // see the comment above for the reason of "len - 1"
2174 return len - 1;
2175 }
2176
2177 virtual size_t GetMBNulLen() const
2178 {
2179 if ( m_minMBCharWidth == 0 )
2180 {
2181 int len = ::WideCharToMultiByte
2182 (
2183 m_CodePage, // code page
2184 0, // no flags
2185 L"", // input string
2186 1, // translate just the NUL
2187 NULL, // output buffer
2188 0, // and its size
2189 NULL, // no replacement char
2190 NULL // [out] don't care if it was used
2191 );
2192
2193 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2194 switch ( len )
2195 {
2196 default:
2197 wxLogDebug(_T("Unexpected NUL length %d"), len);
2198 self->m_minMBCharWidth = (size_t)-1;
2199 break;
2200
2201 case 0:
2202 self->m_minMBCharWidth = (size_t)-1;
2203 break;
2204
2205 case 1:
2206 case 2:
2207 case 4:
2208 self->m_minMBCharWidth = len;
2209 break;
2210 }
2211 }
2212
2213 return m_minMBCharWidth;
2214 }
2215
2216 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2217
2218 bool IsOk() const { return m_CodePage != -1; }
2219
2220 private:
2221 static bool CanUseNoBestFit()
2222 {
2223 static int s_isWin98Or2k = -1;
2224
2225 if ( s_isWin98Or2k == -1 )
2226 {
2227 int verMaj, verMin;
2228 switch ( wxGetOsVersion(&verMaj, &verMin) )
2229 {
2230 case wxWIN95:
2231 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2232 break;
2233
2234 case wxWINDOWS_NT:
2235 s_isWin98Or2k = verMaj >= 5;
2236 break;
2237
2238 default:
2239 // unknown: be conservative by default
2240 s_isWin98Or2k = 0;
2241 break;
2242 }
2243
2244 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2245 }
2246
2247 return s_isWin98Or2k == 1;
2248 }
2249
2250 static bool IsAtLeastWin2kSP4()
2251 {
2252 #ifdef __WXWINCE__
2253 return false;
2254 #else
2255 static int s_isAtLeastWin2kSP4 = -1;
2256
2257 if ( s_isAtLeastWin2kSP4 == -1 )
2258 {
2259 OSVERSIONINFOEX ver;
2260
2261 memset(&ver, 0, sizeof(ver));
2262 ver.dwOSVersionInfoSize = sizeof(ver);
2263 GetVersionEx((OSVERSIONINFO*)&ver);
2264
2265 s_isAtLeastWin2kSP4 =
2266 ((ver.dwMajorVersion > 5) || // Vista+
2267 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2268 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2269 ver.wServicePackMajor >= 4)) // 2000 SP4+
2270 ? 1 : 0;
2271 }
2272
2273 return s_isAtLeastWin2kSP4 == 1;
2274 #endif
2275 }
2276
2277
2278 // the code page we're working with
2279 long m_CodePage;
2280
2281 // cached result of GetMBNulLen(), set to 0 initially meaning
2282 // "unknown"
2283 size_t m_minMBCharWidth;
2284 };
2285
2286 #endif // wxHAVE_WIN32_MB2WC
2287
2288 // ============================================================================
2289 // Cocoa conversion classes
2290 // ============================================================================
2291
2292 #if defined(__WXCOCOA__)
2293
2294 // RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2295 // Strangely enough, Core Foundation uses UTF-32 internally
2296 // quite a bit - it's just not public (yet).
2297
2298 #include <CoreFoundation/CFString.h>
2299 #include <CoreFoundation/CFStringEncodingExt.h>
2300
2301 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2302 {
2303 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2304
2305 switch (encoding)
2306 {
2307 case wxFONTENCODING_DEFAULT :
2308 enc = CFStringGetSystemEncoding();
2309 break ;
2310
2311 case wxFONTENCODING_ISO8859_1 :
2312 enc = kCFStringEncodingISOLatin1 ;
2313 break ;
2314 case wxFONTENCODING_ISO8859_2 :
2315 enc = kCFStringEncodingISOLatin2;
2316 break ;
2317 case wxFONTENCODING_ISO8859_3 :
2318 enc = kCFStringEncodingISOLatin3 ;
2319 break ;
2320 case wxFONTENCODING_ISO8859_4 :
2321 enc = kCFStringEncodingISOLatin4;
2322 break ;
2323 case wxFONTENCODING_ISO8859_5 :
2324 enc = kCFStringEncodingISOLatinCyrillic;
2325 break ;
2326 case wxFONTENCODING_ISO8859_6 :
2327 enc = kCFStringEncodingISOLatinArabic;
2328 break ;
2329 case wxFONTENCODING_ISO8859_7 :
2330 enc = kCFStringEncodingISOLatinGreek;
2331 break ;
2332 case wxFONTENCODING_ISO8859_8 :
2333 enc = kCFStringEncodingISOLatinHebrew;
2334 break ;
2335 case wxFONTENCODING_ISO8859_9 :
2336 enc = kCFStringEncodingISOLatin5;
2337 break ;
2338 case wxFONTENCODING_ISO8859_10 :
2339 enc = kCFStringEncodingISOLatin6;
2340 break ;
2341 case wxFONTENCODING_ISO8859_11 :
2342 enc = kCFStringEncodingISOLatinThai;
2343 break ;
2344 case wxFONTENCODING_ISO8859_13 :
2345 enc = kCFStringEncodingISOLatin7;
2346 break ;
2347 case wxFONTENCODING_ISO8859_14 :
2348 enc = kCFStringEncodingISOLatin8;
2349 break ;
2350 case wxFONTENCODING_ISO8859_15 :
2351 enc = kCFStringEncodingISOLatin9;
2352 break ;
2353
2354 case wxFONTENCODING_KOI8 :
2355 enc = kCFStringEncodingKOI8_R;
2356 break ;
2357 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2358 enc = kCFStringEncodingDOSRussian;
2359 break ;
2360
2361 // case wxFONTENCODING_BULGARIAN :
2362 // enc = ;
2363 // break ;
2364
2365 case wxFONTENCODING_CP437 :
2366 enc = kCFStringEncodingDOSLatinUS ;
2367 break ;
2368 case wxFONTENCODING_CP850 :
2369 enc = kCFStringEncodingDOSLatin1;
2370 break ;
2371 case wxFONTENCODING_CP852 :
2372 enc = kCFStringEncodingDOSLatin2;
2373 break ;
2374 case wxFONTENCODING_CP855 :
2375 enc = kCFStringEncodingDOSCyrillic;
2376 break ;
2377 case wxFONTENCODING_CP866 :
2378 enc = kCFStringEncodingDOSRussian ;
2379 break ;
2380 case wxFONTENCODING_CP874 :
2381 enc = kCFStringEncodingDOSThai;
2382 break ;
2383 case wxFONTENCODING_CP932 :
2384 enc = kCFStringEncodingDOSJapanese;
2385 break ;
2386 case wxFONTENCODING_CP936 :
2387 enc = kCFStringEncodingDOSChineseSimplif ;
2388 break ;
2389 case wxFONTENCODING_CP949 :
2390 enc = kCFStringEncodingDOSKorean;
2391 break ;
2392 case wxFONTENCODING_CP950 :
2393 enc = kCFStringEncodingDOSChineseTrad;
2394 break ;
2395 case wxFONTENCODING_CP1250 :
2396 enc = kCFStringEncodingWindowsLatin2;
2397 break ;
2398 case wxFONTENCODING_CP1251 :
2399 enc = kCFStringEncodingWindowsCyrillic ;
2400 break ;
2401 case wxFONTENCODING_CP1252 :
2402 enc = kCFStringEncodingWindowsLatin1 ;
2403 break ;
2404 case wxFONTENCODING_CP1253 :
2405 enc = kCFStringEncodingWindowsGreek;
2406 break ;
2407 case wxFONTENCODING_CP1254 :
2408 enc = kCFStringEncodingWindowsLatin5;
2409 break ;
2410 case wxFONTENCODING_CP1255 :
2411 enc = kCFStringEncodingWindowsHebrew ;
2412 break ;
2413 case wxFONTENCODING_CP1256 :
2414 enc = kCFStringEncodingWindowsArabic ;
2415 break ;
2416 case wxFONTENCODING_CP1257 :
2417 enc = kCFStringEncodingWindowsBalticRim;
2418 break ;
2419 // This only really encodes to UTF7 (if that) evidently
2420 // case wxFONTENCODING_UTF7 :
2421 // enc = kCFStringEncodingNonLossyASCII ;
2422 // break ;
2423 case wxFONTENCODING_UTF8 :
2424 enc = kCFStringEncodingUTF8 ;
2425 break ;
2426 case wxFONTENCODING_EUC_JP :
2427 enc = kCFStringEncodingEUC_JP;
2428 break ;
2429 case wxFONTENCODING_UTF16 :
2430 enc = kCFStringEncodingUnicode ;
2431 break ;
2432 case wxFONTENCODING_MACROMAN :
2433 enc = kCFStringEncodingMacRoman ;
2434 break ;
2435 case wxFONTENCODING_MACJAPANESE :
2436 enc = kCFStringEncodingMacJapanese ;
2437 break ;
2438 case wxFONTENCODING_MACCHINESETRAD :
2439 enc = kCFStringEncodingMacChineseTrad ;
2440 break ;
2441 case wxFONTENCODING_MACKOREAN :
2442 enc = kCFStringEncodingMacKorean ;
2443 break ;
2444 case wxFONTENCODING_MACARABIC :
2445 enc = kCFStringEncodingMacArabic ;
2446 break ;
2447 case wxFONTENCODING_MACHEBREW :
2448 enc = kCFStringEncodingMacHebrew ;
2449 break ;
2450 case wxFONTENCODING_MACGREEK :
2451 enc = kCFStringEncodingMacGreek ;
2452 break ;
2453 case wxFONTENCODING_MACCYRILLIC :
2454 enc = kCFStringEncodingMacCyrillic ;
2455 break ;
2456 case wxFONTENCODING_MACDEVANAGARI :
2457 enc = kCFStringEncodingMacDevanagari ;
2458 break ;
2459 case wxFONTENCODING_MACGURMUKHI :
2460 enc = kCFStringEncodingMacGurmukhi ;
2461 break ;
2462 case wxFONTENCODING_MACGUJARATI :
2463 enc = kCFStringEncodingMacGujarati ;
2464 break ;
2465 case wxFONTENCODING_MACORIYA :
2466 enc = kCFStringEncodingMacOriya ;
2467 break ;
2468 case wxFONTENCODING_MACBENGALI :
2469 enc = kCFStringEncodingMacBengali ;
2470 break ;
2471 case wxFONTENCODING_MACTAMIL :
2472 enc = kCFStringEncodingMacTamil ;
2473 break ;
2474 case wxFONTENCODING_MACTELUGU :
2475 enc = kCFStringEncodingMacTelugu ;
2476 break ;
2477 case wxFONTENCODING_MACKANNADA :
2478 enc = kCFStringEncodingMacKannada ;
2479 break ;
2480 case wxFONTENCODING_MACMALAJALAM :
2481 enc = kCFStringEncodingMacMalayalam ;
2482 break ;
2483 case wxFONTENCODING_MACSINHALESE :
2484 enc = kCFStringEncodingMacSinhalese ;
2485 break ;
2486 case wxFONTENCODING_MACBURMESE :
2487 enc = kCFStringEncodingMacBurmese ;
2488 break ;
2489 case wxFONTENCODING_MACKHMER :
2490 enc = kCFStringEncodingMacKhmer ;
2491 break ;
2492 case wxFONTENCODING_MACTHAI :
2493 enc = kCFStringEncodingMacThai ;
2494 break ;
2495 case wxFONTENCODING_MACLAOTIAN :
2496 enc = kCFStringEncodingMacLaotian ;
2497 break ;
2498 case wxFONTENCODING_MACGEORGIAN :
2499 enc = kCFStringEncodingMacGeorgian ;
2500 break ;
2501 case wxFONTENCODING_MACARMENIAN :
2502 enc = kCFStringEncodingMacArmenian ;
2503 break ;
2504 case wxFONTENCODING_MACCHINESESIMP :
2505 enc = kCFStringEncodingMacChineseSimp ;
2506 break ;
2507 case wxFONTENCODING_MACTIBETAN :
2508 enc = kCFStringEncodingMacTibetan ;
2509 break ;
2510 case wxFONTENCODING_MACMONGOLIAN :
2511 enc = kCFStringEncodingMacMongolian ;
2512 break ;
2513 case wxFONTENCODING_MACETHIOPIC :
2514 enc = kCFStringEncodingMacEthiopic ;
2515 break ;
2516 case wxFONTENCODING_MACCENTRALEUR :
2517 enc = kCFStringEncodingMacCentralEurRoman ;
2518 break ;
2519 case wxFONTENCODING_MACVIATNAMESE :
2520 enc = kCFStringEncodingMacVietnamese ;
2521 break ;
2522 case wxFONTENCODING_MACARABICEXT :
2523 enc = kCFStringEncodingMacExtArabic ;
2524 break ;
2525 case wxFONTENCODING_MACSYMBOL :
2526 enc = kCFStringEncodingMacSymbol ;
2527 break ;
2528 case wxFONTENCODING_MACDINGBATS :
2529 enc = kCFStringEncodingMacDingbats ;
2530 break ;
2531 case wxFONTENCODING_MACTURKISH :
2532 enc = kCFStringEncodingMacTurkish ;
2533 break ;
2534 case wxFONTENCODING_MACCROATIAN :
2535 enc = kCFStringEncodingMacCroatian ;
2536 break ;
2537 case wxFONTENCODING_MACICELANDIC :
2538 enc = kCFStringEncodingMacIcelandic ;
2539 break ;
2540 case wxFONTENCODING_MACROMANIAN :
2541 enc = kCFStringEncodingMacRomanian ;
2542 break ;
2543 case wxFONTENCODING_MACCELTIC :
2544 enc = kCFStringEncodingMacCeltic ;
2545 break ;
2546 case wxFONTENCODING_MACGAELIC :
2547 enc = kCFStringEncodingMacGaelic ;
2548 break ;
2549 // case wxFONTENCODING_MACKEYBOARD :
2550 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2551 // break ;
2552
2553 default :
2554 // because gcc is picky
2555 break ;
2556 }
2557
2558 return enc ;
2559 }
2560
2561 class wxMBConv_cocoa : public wxMBConv
2562 {
2563 public:
2564 wxMBConv_cocoa()
2565 {
2566 Init(CFStringGetSystemEncoding()) ;
2567 }
2568
2569 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2570 {
2571 m_encoding = conv.m_encoding;
2572 }
2573
2574 #if wxUSE_FONTMAP
2575 wxMBConv_cocoa(const wxChar* name)
2576 {
2577 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2578 }
2579 #endif
2580
2581 wxMBConv_cocoa(wxFontEncoding encoding)
2582 {
2583 Init( wxCFStringEncFromFontEnc(encoding) );
2584 }
2585
2586 ~wxMBConv_cocoa()
2587 {
2588 }
2589
2590 void Init( CFStringEncoding encoding)
2591 {
2592 m_encoding = encoding ;
2593 }
2594
2595 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2596 {
2597 wxASSERT(szUnConv);
2598
2599 CFStringRef theString = CFStringCreateWithBytes (
2600 NULL, //the allocator
2601 (const UInt8*)szUnConv,
2602 strlen(szUnConv),
2603 m_encoding,
2604 false //no BOM/external representation
2605 );
2606
2607 wxASSERT(theString);
2608
2609 size_t nOutLength = CFStringGetLength(theString);
2610
2611 if (szOut == NULL)
2612 {
2613 CFRelease(theString);
2614 return nOutLength;
2615 }
2616
2617 CFRange theRange = { 0, nOutSize };
2618
2619 #if SIZEOF_WCHAR_T == 4
2620 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2621 #endif
2622
2623 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2624
2625 CFRelease(theString);
2626
2627 szUniCharBuffer[nOutLength] = '\0';
2628
2629 #if SIZEOF_WCHAR_T == 4
2630 wxMBConvUTF16 converter;
2631 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2632 delete [] szUniCharBuffer;
2633 #endif
2634
2635 return nOutLength;
2636 }
2637
2638 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2639 {
2640 wxASSERT(szUnConv);
2641
2642 size_t nRealOutSize;
2643 size_t nBufSize = wxWcslen(szUnConv);
2644 UniChar* szUniBuffer = (UniChar*) szUnConv;
2645
2646 #if SIZEOF_WCHAR_T == 4
2647 wxMBConvUTF16 converter ;
2648 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2649 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2650 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2651 nBufSize /= sizeof(UniChar);
2652 #endif
2653
2654 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2655 NULL, //allocator
2656 szUniBuffer,
2657 nBufSize,
2658 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2659 );
2660
2661 wxASSERT(theString);
2662
2663 //Note that CER puts a BOM when converting to unicode
2664 //so we check and use getchars instead in that case
2665 if (m_encoding == kCFStringEncodingUnicode)
2666 {
2667 if (szOut != NULL)
2668 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2669
2670 nRealOutSize = CFStringGetLength(theString) + 1;
2671 }
2672 else
2673 {
2674 CFStringGetBytes(
2675 theString,
2676 CFRangeMake(0, CFStringGetLength(theString)),
2677 m_encoding,
2678 0, //what to put in characters that can't be converted -
2679 //0 tells CFString to return NULL if it meets such a character
2680 false, //not an external representation
2681 (UInt8*) szOut,
2682 nOutSize,
2683 (CFIndex*) &nRealOutSize
2684 );
2685 }
2686
2687 CFRelease(theString);
2688
2689 #if SIZEOF_WCHAR_T == 4
2690 delete[] szUniBuffer;
2691 #endif
2692
2693 return nRealOutSize - 1;
2694 }
2695
2696 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2697
2698 bool IsOk() const
2699 {
2700 return m_encoding != kCFStringEncodingInvalidId &&
2701 CFStringIsEncodingAvailable(m_encoding);
2702 }
2703
2704 private:
2705 CFStringEncoding m_encoding ;
2706 };
2707
2708 #endif // defined(__WXCOCOA__)
2709
2710 // ============================================================================
2711 // Mac conversion classes
2712 // ============================================================================
2713
2714 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2715
2716 class wxMBConv_mac : public wxMBConv
2717 {
2718 public:
2719 wxMBConv_mac()
2720 {
2721 Init(CFStringGetSystemEncoding()) ;
2722 }
2723
2724 wxMBConv_mac(const wxMBConv_mac& conv)
2725 {
2726 Init(conv.m_char_encoding);
2727 }
2728
2729 #if wxUSE_FONTMAP
2730 wxMBConv_mac(const wxChar* name)
2731 {
2732 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2733 }
2734 #endif
2735
2736 wxMBConv_mac(wxFontEncoding encoding)
2737 {
2738 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2739 }
2740
2741 ~wxMBConv_mac()
2742 {
2743 OSStatus status = noErr ;
2744 status = TECDisposeConverter(m_MB2WC_converter);
2745 status = TECDisposeConverter(m_WC2MB_converter);
2746 }
2747
2748
2749 void Init( TextEncodingBase encoding)
2750 {
2751 OSStatus status = noErr ;
2752 m_char_encoding = encoding ;
2753 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2754
2755 status = TECCreateConverter(&m_MB2WC_converter,
2756 m_char_encoding,
2757 m_unicode_encoding);
2758 status = TECCreateConverter(&m_WC2MB_converter,
2759 m_unicode_encoding,
2760 m_char_encoding);
2761 }
2762
2763 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2764 {
2765 OSStatus status = noErr ;
2766 ByteCount byteOutLen ;
2767 ByteCount byteInLen = strlen(psz) + 1;
2768 wchar_t *tbuf = NULL ;
2769 UniChar* ubuf = NULL ;
2770 size_t res = 0 ;
2771
2772 if (buf == NULL)
2773 {
2774 // Apple specs say at least 32
2775 n = wxMax( 32, byteInLen ) ;
2776 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2777 }
2778
2779 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2780
2781 #if SIZEOF_WCHAR_T == 4
2782 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2783 #else
2784 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2785 #endif
2786
2787 status = TECConvertText(
2788 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2789 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2790
2791 #if SIZEOF_WCHAR_T == 4
2792 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2793 // is not properly terminated we get random characters at the end
2794 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2795 wxMBConvUTF16 converter ;
2796 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2797 free( ubuf ) ;
2798 #else
2799 res = byteOutLen / sizeof( UniChar ) ;
2800 #endif
2801
2802 if ( buf == NULL )
2803 free(tbuf) ;
2804
2805 if ( buf && res < n)
2806 buf[res] = 0;
2807
2808 return res ;
2809 }
2810
2811 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2812 {
2813 OSStatus status = noErr ;
2814 ByteCount byteOutLen ;
2815 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2816
2817 char *tbuf = NULL ;
2818
2819 if (buf == NULL)
2820 {
2821 // Apple specs say at least 32
2822 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2823 tbuf = (char*) malloc( n ) ;
2824 }
2825
2826 ByteCount byteBufferLen = n ;
2827 UniChar* ubuf = NULL ;
2828
2829 #if SIZEOF_WCHAR_T == 4
2830 wxMBConvUTF16 converter ;
2831 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2832 byteInLen = unicharlen ;
2833 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2834 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2835 #else
2836 ubuf = (UniChar*) psz ;
2837 #endif
2838
2839 status = TECConvertText(
2840 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2841 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2842
2843 #if SIZEOF_WCHAR_T == 4
2844 free( ubuf ) ;
2845 #endif
2846
2847 if ( buf == NULL )
2848 free(tbuf) ;
2849
2850 size_t res = byteOutLen ;
2851 if ( buf && res < n)
2852 {
2853 buf[res] = 0;
2854
2855 //we need to double-trip to verify it didn't insert any ? in place
2856 //of bogus characters
2857 wxWCharBuffer wcBuf(n);
2858 size_t pszlen = wxWcslen(psz);
2859 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2860 wxWcslen(wcBuf) != pszlen ||
2861 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2862 {
2863 // we didn't obtain the same thing we started from, hence
2864 // the conversion was lossy and we consider that it failed
2865 return wxCONV_FAILED;
2866 }
2867 }
2868
2869 return res ;
2870 }
2871
2872 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2873
2874 bool IsOk() const
2875 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2876
2877 private:
2878 TECObjectRef m_MB2WC_converter;
2879 TECObjectRef m_WC2MB_converter;
2880
2881 TextEncodingBase m_char_encoding;
2882 TextEncodingBase m_unicode_encoding;
2883 };
2884
2885 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2886
2887 // ============================================================================
2888 // wxEncodingConverter based conversion classes
2889 // ============================================================================
2890
2891 #if wxUSE_FONTMAP
2892
2893 class wxMBConv_wxwin : public wxMBConv
2894 {
2895 private:
2896 void Init()
2897 {
2898 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2899 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2900 }
2901
2902 public:
2903 // temporarily just use wxEncodingConverter stuff,
2904 // so that it works while a better implementation is built
2905 wxMBConv_wxwin(const wxChar* name)
2906 {
2907 if (name)
2908 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2909 else
2910 m_enc = wxFONTENCODING_SYSTEM;
2911
2912 Init();
2913 }
2914
2915 wxMBConv_wxwin(wxFontEncoding enc)
2916 {
2917 m_enc = enc;
2918
2919 Init();
2920 }
2921
2922 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2923 {
2924 size_t inbuf = strlen(psz);
2925 if (buf)
2926 {
2927 if (!m2w.Convert(psz, buf))
2928 return wxCONV_FAILED;
2929 }
2930 return inbuf;
2931 }
2932
2933 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2934 {
2935 const size_t inbuf = wxWcslen(psz);
2936 if (buf)
2937 {
2938 if (!w2m.Convert(psz, buf))
2939 return wxCONV_FAILED;
2940 }
2941
2942 return inbuf;
2943 }
2944
2945 virtual size_t GetMBNulLen() const
2946 {
2947 switch ( m_enc )
2948 {
2949 case wxFONTENCODING_UTF16BE:
2950 case wxFONTENCODING_UTF16LE:
2951 return 2;
2952
2953 case wxFONTENCODING_UTF32BE:
2954 case wxFONTENCODING_UTF32LE:
2955 return 4;
2956
2957 default:
2958 return 1;
2959 }
2960 }
2961
2962 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2963
2964 bool IsOk() const { return m_ok; }
2965
2966 public:
2967 wxFontEncoding m_enc;
2968 wxEncodingConverter m2w, w2m;
2969
2970 private:
2971 // were we initialized successfully?
2972 bool m_ok;
2973
2974 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2975 };
2976
2977 // make the constructors available for unit testing
2978 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2979 {
2980 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2981 if ( !result->IsOk() )
2982 {
2983 delete result;
2984 return 0;
2985 }
2986
2987 return result;
2988 }
2989
2990 #endif // wxUSE_FONTMAP
2991
2992 // ============================================================================
2993 // wxCSConv implementation
2994 // ============================================================================
2995
2996 void wxCSConv::Init()
2997 {
2998 m_name = NULL;
2999 m_convReal = NULL;
3000 m_deferred = true;
3001 }
3002
3003 wxCSConv::wxCSConv(const wxChar *charset)
3004 {
3005 Init();
3006
3007 if ( charset )
3008 {
3009 SetName(charset);
3010 }
3011
3012 #if wxUSE_FONTMAP
3013 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3014 #else
3015 m_encoding = wxFONTENCODING_SYSTEM;
3016 #endif
3017 }
3018
3019 wxCSConv::wxCSConv(wxFontEncoding encoding)
3020 {
3021 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3022 {
3023 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3024
3025 encoding = wxFONTENCODING_SYSTEM;
3026 }
3027
3028 Init();
3029
3030 m_encoding = encoding;
3031 }
3032
3033 wxCSConv::~wxCSConv()
3034 {
3035 Clear();
3036 }
3037
3038 wxCSConv::wxCSConv(const wxCSConv& conv)
3039 : wxMBConv()
3040 {
3041 Init();
3042
3043 SetName(conv.m_name);
3044 m_encoding = conv.m_encoding;
3045 }
3046
3047 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3048 {
3049 Clear();
3050
3051 SetName(conv.m_name);
3052 m_encoding = conv.m_encoding;
3053
3054 return *this;
3055 }
3056
3057 void wxCSConv::Clear()
3058 {
3059 free(m_name);
3060 delete m_convReal;
3061
3062 m_name = NULL;
3063 m_convReal = NULL;
3064 }
3065
3066 void wxCSConv::SetName(const wxChar *charset)
3067 {
3068 if (charset)
3069 {
3070 m_name = wxStrdup(charset);
3071 m_deferred = true;
3072 }
3073 }
3074
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// Cache used by wxCSConv::DoCreate() when looking for a working iconv
// converter: it maps an encoding to the charset name for which the converter
// could be created, or to an empty string if none of the names of this
// encoding worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
#endif
3083
3084 wxMBConv *wxCSConv::DoCreate() const
3085 {
3086 #if wxUSE_FONTMAP
3087 wxLogTrace(TRACE_STRCONV,
3088 wxT("creating conversion for %s"),
3089 (m_name ? m_name
3090 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3091 #endif // wxUSE_FONTMAP
3092
3093 // check for the special case of ASCII or ISO8859-1 charset: as we have
3094 // special knowledge of it anyhow, we don't need to create a special
3095 // conversion object
3096 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3097 m_encoding == wxFONTENCODING_DEFAULT )
3098 {
3099 // don't convert at all
3100 return NULL;
3101 }
3102
3103 // we trust OS to do conversion better than we can so try external
3104 // conversion methods first
3105 //
3106 // the full order is:
3107 // 1. OS conversion (iconv() under Unix or Win32 API)
3108 // 2. hard coded conversions for UTF
3109 // 3. wxEncodingConverter as fall back
3110
3111 // step (1)
3112 #ifdef HAVE_ICONV
3113 #if !wxUSE_FONTMAP
3114 if ( m_name )
3115 #endif // !wxUSE_FONTMAP
3116 {
3117 wxString name(m_name);
3118 wxFontEncoding encoding(m_encoding);
3119
3120 if ( !name.empty() )
3121 {
3122 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3123 if ( conv->IsOk() )
3124 return conv;
3125
3126 delete conv;
3127
3128 #if wxUSE_FONTMAP
3129 encoding =
3130 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3131 #endif // wxUSE_FONTMAP
3132 }
3133 #if wxUSE_FONTMAP
3134 {
3135 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3136 if ( it != gs_nameCache.end() )
3137 {
3138 if ( it->second.empty() )
3139 return NULL;
3140
3141 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3142 if ( conv->IsOk() )
3143 return conv;
3144
3145 delete conv;
3146 }
3147
3148 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3149
3150 for ( ; *names; ++names )
3151 {
3152 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3153 if ( conv->IsOk() )
3154 {
3155 gs_nameCache[encoding] = *names;
3156 return conv;
3157 }
3158
3159 delete conv;
3160 }
3161
3162 gs_nameCache[encoding] = _T(""); // cache the failure
3163 }
3164 #endif // wxUSE_FONTMAP
3165 }
3166 #endif // HAVE_ICONV
3167
3168 #ifdef wxHAVE_WIN32_MB2WC
3169 {
3170 #if wxUSE_FONTMAP
3171 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3172 : new wxMBConv_win32(m_encoding);
3173 if ( conv->IsOk() )
3174 return conv;
3175
3176 delete conv;
3177 #else
3178 return NULL;
3179 #endif
3180 }
3181 #endif // wxHAVE_WIN32_MB2WC
3182
3183 #if defined(__WXMAC__)
3184 {
3185 // leave UTF16 and UTF32 to the built-ins of wx
3186 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3187 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3188 {
3189 #if wxUSE_FONTMAP
3190 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3191 : new wxMBConv_mac(m_encoding);
3192 #else
3193 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3194 #endif
3195 if ( conv->IsOk() )
3196 return conv;
3197
3198 delete conv;
3199 }
3200 }
3201 #endif
3202
3203 #if defined(__WXCOCOA__)
3204 {
3205 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3206 {
3207 #if wxUSE_FONTMAP
3208 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3209 : new wxMBConv_cocoa(m_encoding);
3210 #else
3211 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3212 #endif
3213
3214 if ( conv->IsOk() )
3215 return conv;
3216
3217 delete conv;
3218 }
3219 }
3220 #endif
3221 // step (2)
3222 wxFontEncoding enc = m_encoding;
3223 #if wxUSE_FONTMAP
3224 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3225 {
3226 // use "false" to suppress interactive dialogs -- we can be called from
3227 // anywhere and popping up a dialog from here is the last thing we want to
3228 // do
3229 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3230 }
3231 #endif // wxUSE_FONTMAP
3232
3233 switch ( enc )
3234 {
3235 case wxFONTENCODING_UTF7:
3236 return new wxMBConvUTF7;
3237
3238 case wxFONTENCODING_UTF8:
3239 return new wxMBConvUTF8;
3240
3241 case wxFONTENCODING_UTF16BE:
3242 return new wxMBConvUTF16BE;
3243
3244 case wxFONTENCODING_UTF16LE:
3245 return new wxMBConvUTF16LE;
3246
3247 case wxFONTENCODING_UTF32BE:
3248 return new wxMBConvUTF32BE;
3249
3250 case wxFONTENCODING_UTF32LE:
3251 return new wxMBConvUTF32LE;
3252
3253 default:
3254 // nothing to do but put here to suppress gcc warnings
3255 break;
3256 }
3257
3258 // step (3)
3259 #if wxUSE_FONTMAP
3260 {
3261 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3262 : new wxMBConv_wxwin(m_encoding);
3263 if ( conv->IsOk() )
3264 return conv;
3265
3266 delete conv;
3267 }
3268 #endif // wxUSE_FONTMAP
3269
3270 // NB: This is a hack to prevent deadlock. What could otherwise happen
3271 // in Unicode build: wxConvLocal creation ends up being here
3272 // because of some failure and logs the error. But wxLog will try to
3273 // attach timestamp, for which it will need wxConvLocal (to convert
3274 // time to char* and then wchar_t*), but that fails, tries to log
3275 // error, but wxLog has a (already locked) critical section that
3276 // guards static buffer.
3277 static bool alreadyLoggingError = false;
3278 if (!alreadyLoggingError)
3279 {
3280 alreadyLoggingError = true;
3281 wxLogError(_("Cannot convert from the charset '%s'!"),
3282 m_name ? m_name
3283 :
3284 #if wxUSE_FONTMAP
3285 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3286 #else // !wxUSE_FONTMAP
3287 wxString::Format(_("encoding %s"), m_encoding).c_str()
3288 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3289 );
3290
3291 alreadyLoggingError = false;
3292 }
3293
3294 return NULL;
3295 }
3296
3297 void wxCSConv::CreateConvIfNeeded() const
3298 {
3299 if ( m_deferred )
3300 {
3301 wxCSConv *self = (wxCSConv *)this; // const_cast
3302
3303 #if wxUSE_INTL
3304 // if we don't have neither the name nor the encoding, use the default
3305 // encoding for this system
3306 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3307 {
3308 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3309 }
3310 #endif // wxUSE_INTL
3311
3312 self->m_convReal = DoCreate();
3313 self->m_deferred = false;
3314 }
3315 }
3316
3317 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3318 {
3319 CreateConvIfNeeded();
3320
3321 if (m_convReal)
3322 return m_convReal->MB2WC(buf, psz, n);
3323
3324 // latin-1 (direct)
3325 size_t len = strlen(psz);
3326
3327 if (buf)
3328 {
3329 for (size_t c = 0; c <= len; c++)
3330 buf[c] = (unsigned char)(psz[c]);
3331 }
3332
3333 return len;
3334 }
3335
3336 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3337 {
3338 CreateConvIfNeeded();
3339
3340 if (m_convReal)
3341 return m_convReal->WC2MB(buf, psz, n);
3342
3343 // latin-1 (direct)
3344 const size_t len = wxWcslen(psz);
3345 if (buf)
3346 {
3347 for (size_t c = 0; c <= len; c++)
3348 {
3349 if (psz[c] > 0xFF)
3350 return wxCONV_FAILED;
3351
3352 buf[c] = (char)psz[c];
3353 }
3354 }
3355 else
3356 {
3357 for (size_t c = 0; c <= len; c++)
3358 {
3359 if (psz[c] > 0xFF)
3360 return wxCONV_FAILED;
3361 }
3362 }
3363
3364 return len;
3365 }
3366
3367 size_t wxCSConv::GetMBNulLen() const
3368 {
3369 CreateConvIfNeeded();
3370
3371 if ( m_convReal )
3372 {
3373 return m_convReal->GetMBNulLen();
3374 }
3375
3376 return 1;
3377 }
3378
3379 // ----------------------------------------------------------------------------
3380 // globals
3381 // ----------------------------------------------------------------------------
3382
// the object behind wxConvLibc: its class depends on the platform
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other predefined converters
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;

// the exported references and pointers are all bound to the static objects
// defined above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
// file names are converted using UTF-8 if __WXOSX__ is defined and using the
// same converter as wxConvLibc otherwise
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
                                    wxConvUTF8Obj;
#else
                                    wxConvLibcObj;
#endif
3409
3410 #else // !wxUSE_WCHAR_T
3411
// stand-ins in absence of wchar_t: plain wxMBConv objects are defined under
// the names of the predefined converters
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3417
3418 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T