]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
cleanup (repeat of v197) - added whitespace around operators, some blank lines, fixed...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifndef WX_PRECOMP
19 #include "wx/intl.h"
20 #include "wx/log.h"
21 #endif
22
23 #include "wx/strconv.h"
24
25 #if wxUSE_WCHAR_T
26
27 #ifdef __WINDOWS__
28 #include "wx/msw/private.h"
29 #include "wx/msw/missing.h"
30 #endif
31
32 #ifndef __WXWINCE__
33 #include <errno.h>
34 #endif
35
36 #include <ctype.h>
37 #include <string.h>
38 #include <stdlib.h>
39
40 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
41 #define wxHAVE_WIN32_MB2WC
42 #endif
43
44 #ifdef __SALFORDC__
45 #include <clib.h>
46 #endif
47
48 #ifdef HAVE_ICONV
49 #include <iconv.h>
50 #include "wx/thread.h"
51 #endif
52
53 #include "wx/encconv.h"
54 #include "wx/fontmap.h"
55 #include "wx/utils.h"
56
57 #ifdef __WXMAC__
58 #ifndef __DARWIN__
59 #include <ATSUnicode.h>
60 #include <TextCommon.h>
61 #include <TextEncodingConverter.h>
62 #endif
63
64 // includes Mac headers
65 #include "wx/mac/private.h"
66 #endif
67
68
69 #define TRACE_STRCONV _T("strconv")
70
71 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
72 // be 4 bytes
73 #if SIZEOF_WCHAR_T == 2
74 #define WC_UTF16
75 #endif
76
77
78 // ============================================================================
79 // implementation
80 // ============================================================================
81
// helper function of cMB2WC(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are (or if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
90
91 // ----------------------------------------------------------------------------
92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
93 // ----------------------------------------------------------------------------
94
95 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
96 {
97 if (input <= 0xffff)
98 {
99 if (output)
100 *output = (wxUint16) input;
101
102 return 1;
103 }
104 else if (input >= 0x110000)
105 {
106 return wxCONV_FAILED;
107 }
108 else
109 {
110 if (output)
111 {
112 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
113 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
114 }
115
116 return 2;
117 }
118 }
119
120 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
121 {
122 if ((*input < 0xd800) || (*input > 0xdfff))
123 {
124 output = *input;
125 return 1;
126 }
127 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
128 {
129 output = *input;
130 return wxCONV_FAILED;
131 }
132 else
133 {
134 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
135 return 2;
136 }
137 }
138
139 #ifdef WC_UTF16
140 typedef wchar_t wxDecodeSurrogate_t;
141 #else // !WC_UTF16
142 typedef wxUint16 wxDecodeSurrogate_t;
143 #endif // WC_UTF16/!WC_UTF16
144
145 // returns the next UTF-32 character from the wchar_t buffer and advances the
146 // pointer to the character after this one
147 //
148 // if an invalid character is found, *pSrc is set to NULL, the caller must
149 // check for this
150 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
151 {
152 wxUint32 out;
153 const size_t n = decode_utf16(*pSrc, out);
154 if ( n == wxCONV_FAILED )
155 *pSrc = NULL;
156 else
157 *pSrc += n;
158
159 return out;
160 }
161
162 // ----------------------------------------------------------------------------
163 // wxMBConv
164 // ----------------------------------------------------------------------------
165
// Default implementation of the length-aware multibyte -> wide conversion in
// terms of the old MB2WC().
//
// dst/dstLen: output buffer and its size in wchar_t; if dst is NULL nothing
//             is written and only the required length is computed
// src/srcLen: input and its length in bytes or wxNO_LEN if src is
//             NUL-terminated
//
// The input is processed chunk by chunk, a chunk being a NUL-terminated
// piece of src. Returns the number of wchar_t needed for the chunks
// processed, counting one L'\0' per chunk, or wxCONV_FAILED if GetMBNulLen()
// or MB2WC() fail or if dst is too small.
//
// NOTE(review): an empty chunk ends the loop before anything is written to
// dst for it (although it is counted in the return value), so conversion
// also stops at the first empty chunk inside the input -- confirm that the
// callers are fine with this.
166 size_t
167 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
168 const char *src, size_t srcLen) const
169 {
170 // although new conversion classes are supposed to implement this function
171 // directly, the existing ones only implement the old MB2WC() and so, to
172 // avoid to have to rewrite all conversion classes at once, we provide a
173 // default (but not efficient) implementation of this one in terms of the
174 // old function by copying the input to ensure that it's NUL-terminated and
175 // then using MB2WC() to convert it
176
177 // the number of chars [which would be] written to dst [if it were not NULL]
178 size_t dstWritten = 0;
179
180 // the number of NULs terminating this string
181 size_t nulLen wxDUMMY_INITIALIZE(0);
182
183 // if we were not given the input size we just have to assume that the
184 // string is properly terminated as we have no way of knowing how long it
185 // is anyhow, but if we do have the size check whether there are enough
186 // NULs at the end
187 wxCharBuffer bufTmp;
188 const char *srcEnd;
189 if ( srcLen != wxNO_LEN )
190 {
191 // we need to know how to find the end of this string
192 nulLen = GetMBNulLen();
193 if ( nulLen == wxCONV_FAILED )
194 return wxCONV_FAILED;
195
196 // if there are enough NULs we can avoid the copy
197 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
198 {
199 // make a copy in order to properly NUL-terminate the string
200 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
201 char * const p = bufTmp.data();
202 memcpy(p, src, srcLen);
203 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
204 *s = '\0';
205
206 src = bufTmp;
207 }
208
209 srcEnd = src + srcLen;
210 }
211 else // quit after the first loop iteration
212 {
213 srcEnd = NULL;
214 }
215
216 for ( ;; )
217 {
218 // try to convert the current chunk
219 size_t lenChunk = MB2WC(NULL, src, 0);
220 if ( lenChunk == wxCONV_FAILED )
221 return wxCONV_FAILED;
222
223 lenChunk++; // for the L'\0' at the end of this chunk
224
225 dstWritten += lenChunk;
226
227 if ( lenChunk == 1 )
228 {
229 // nothing left in the input string, conversion succeeded
230 break;
231 }
232
233 if ( dst )
234 {
235 if ( dstWritten > dstLen )
236 return wxCONV_FAILED;
237
238 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
239 return wxCONV_FAILED;
240
241 dst += lenChunk;
242 }
243
244 if ( !srcEnd )
245 {
246 // we convert just one chunk in this case as this is the entire
247 // string anyhow
248 break;
249 }
250
251 // advance the input pointer past the end of this chunk
252 while ( NotAllNULs(src, nulLen) )
253 {
254 // notice that we must skip over multiple bytes here as we suppose
255 // that if NUL takes 2 or 4 bytes, then all the other characters do
256 // too and so if advanced by a single byte we might erroneously
257 // detect sequences of NUL bytes in the middle of the input
258 src += nulLen;
259 }
260
261 src += nulLen; // skipping over its terminator as well
262
263 // note that ">=" (and not just "==") is needed here as the terminator
264 // we skipped just above could be inside or just after the buffer
265 // delimited by srcEnd
266 if ( src >= srcEnd )
267 break;
268 }
269
270 return dstWritten;
271 }
272
273 size_t
274 wxMBConv::FromWChar(char *dst, size_t dstLen,
275 const wchar_t *src, size_t srcLen) const
276 {
277 // the number of chars [which would be] written to dst [if it were not NULL]
278 size_t dstWritten = 0;
279
280 // make a copy of the input string unless it is already properly
281 // NUL-terminated
282 //
283 // if we don't know its length we have no choice but to assume that it is,
284 // indeed, properly terminated
285 wxWCharBuffer bufTmp;
286 if ( srcLen == wxNO_LEN )
287 {
288 srcLen = wxWcslen(src) + 1;
289 }
290 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
291 {
292 // make a copy in order to properly NUL-terminate the string
293 bufTmp = wxWCharBuffer(srcLen);
294 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
295 src = bufTmp;
296 }
297
298 const size_t lenNul = GetMBNulLen();
299 for ( const wchar_t * const srcEnd = src + srcLen;
300 src < srcEnd;
301 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
302 {
303 // try to convert the current chunk
304 size_t lenChunk = WC2MB(NULL, src, 0);
305
306 if ( lenChunk == wxCONV_FAILED )
307 return wxCONV_FAILED;
308
309 lenChunk += lenNul;
310 dstWritten += lenChunk;
311
312 if ( dst )
313 {
314 if ( dstWritten > dstLen )
315 return wxCONV_FAILED;
316
317 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
318 return wxCONV_FAILED;
319
320 dst += lenChunk;
321 }
322 }
323
324 return dstWritten;
325 }
326
327 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
328 {
329 size_t rc = ToWChar(outBuff, outLen, inBuff);
330 if ( rc != wxCONV_FAILED )
331 {
332 // ToWChar() returns the buffer length, i.e. including the trailing
333 // NUL, while this method doesn't take it into account
334 rc--;
335 }
336
337 return rc;
338 }
339
340 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
341 {
342 size_t rc = FromWChar(outBuff, outLen, inBuff);
343 if ( rc != wxCONV_FAILED )
344 {
345 rc -= GetMBNulLen();
346 }
347
348 return rc;
349 }
350
351 wxMBConv::~wxMBConv()
352 {
353 // nothing to do here (necessary for Darwin linking probably)
354 }
355
356 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
357 {
358 if ( psz )
359 {
360 // calculate the length of the buffer needed first
361 const size_t nLen = MB2WC(NULL, psz, 0);
362 if ( nLen != wxCONV_FAILED )
363 {
364 // now do the actual conversion
365 wxWCharBuffer buf(nLen /* +1 added implicitly */);
366
367 // +1 for the trailing NULL
368 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
369 return buf;
370 }
371 }
372
373 return wxWCharBuffer();
374 }
375
376 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
377 {
378 if ( pwz )
379 {
380 const size_t nLen = WC2MB(NULL, pwz, 0);
381 if ( nLen != wxCONV_FAILED )
382 {
383 // extra space for trailing NUL(s)
384 static const size_t extraLen = GetMaxMBNulLen();
385
386 wxCharBuffer buf(nLen + extraLen - 1);
387 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
388 return buf;
389 }
390 }
391
392 return wxCharBuffer();
393 }
394
395 const wxWCharBuffer
396 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
397 {
398 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
399 if ( dstLen != wxCONV_FAILED )
400 {
401 wxWCharBuffer wbuf(dstLen - 1);
402 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
403 {
404 if ( outLen )
405 {
406 *outLen = dstLen;
407 if ( wbuf[dstLen - 1] == L'\0' )
408 (*outLen)--;
409 }
410
411 return wbuf;
412 }
413 }
414
415 if ( outLen )
416 *outLen = 0;
417
418 return wxWCharBuffer();
419 }
420
421 const wxCharBuffer
422 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
423 {
424 const size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
425 if ( dstLen != wxCONV_FAILED )
426 {
427 wxCharBuffer buf(dstLen - 1);
428 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
429 {
430 if ( outLen )
431 {
432 *outLen = dstLen;
433
434 const size_t nulLen = GetMBNulLen();
435 if ( !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
436 {
437 // in this case the output is NUL-terminated and we're not
438 // supposed to count NUL
439 (*outLen) -= nulLen;
440 }
441 }
442
443 return buf;
444 }
445 }
446
447 if ( outLen )
448 *outLen = 0;
449
450 return wxCharBuffer();
451 }
452
453 // ----------------------------------------------------------------------------
454 // wxMBConvLibc
455 // ----------------------------------------------------------------------------
456
457 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
458 {
459 return wxMB2WC(buf, psz, n);
460 }
461
462 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
463 {
464 return wxWC2MB(buf, psz, n);
465 }
466
467 // ----------------------------------------------------------------------------
468 // wxConvBrokenFileNames
469 // ----------------------------------------------------------------------------
470
471 #ifdef __UNIX__
472
473 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
474 {
475 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
476 || wxStricmp(charset, _T("UTF8")) == 0 )
477 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
478 else
479 m_conv = new wxCSConv(charset);
480 }
481
482 #endif // __UNIX__
483
484 // ----------------------------------------------------------------------------
485 // UTF-7
486 // ----------------------------------------------------------------------------
487
488 // Implementation (C) 2004 Fredrik Roubert
489
490 //
491 // BASE64 decoding table
492 //
// The table is indexed by the (unsigned) byte value and contains the 6 bit
// value encoded by this byte in BASE64 ('A'-'Z' => 0x00-0x19, 'a'-'z' =>
// 0x1a-0x33, '0'-'9' => 0x34-0x3d, '+' => 0x3e, '/' => 0x3f) or 0xff for the
// bytes which are not part of the BASE64 alphabet: wxMBConvUTF7::MB2WC()
// stops decoding the encoded run at the first such byte.
493 static const unsigned char utf7unb64[] =
494 {
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
500 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
501 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
502 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
504 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
505 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
506 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
508 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
509 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
510 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
527 };
528
529 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
530 {
531 size_t len = 0;
532
533 while ( *psz && (!buf || (len < n)) )
534 {
535 unsigned char cc = *psz++;
536 if (cc != '+')
537 {
538 // plain ASCII char
539 if (buf)
540 *buf++ = cc;
541 len++;
542 }
543 else if (*psz == '-')
544 {
545 // encoded plus sign
546 if (buf)
547 *buf++ = cc;
548 len++;
549 psz++;
550 }
551 else // start of BASE64 encoded string
552 {
553 bool lsb, ok;
554 unsigned int d, l;
555 for ( ok = lsb = false, d = 0, l = 0;
556 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
557 psz++ )
558 {
559 d <<= 6;
560 d += cc;
561 for (l += 6; l >= 8; lsb = !lsb)
562 {
563 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
564 if (lsb)
565 {
566 if (buf)
567 *buf++ |= c;
568 len ++;
569 }
570 else
571 {
572 if (buf)
573 *buf = (wchar_t)(c << 8);
574 }
575
576 ok = true;
577 }
578 }
579
580 if ( !ok )
581 {
582 // in valid UTF7 we should have valid characters after '+'
583 return wxCONV_FAILED;
584 }
585
586 if (*psz == '-')
587 psz++;
588 }
589 }
590
591 if ( buf && (len < n) )
592 *buf = '\0';
593
594 return len;
595 }
596
597 //
598 // BASE64 encoding table
599 //
// maps a 6 bit value (0..63) to the character representing it in BASE64,
// wxMBConvUTF7::WC2MB() always reduces the index modulo 64 before using it
600 static const unsigned char utf7enb64[] =
601 {
602 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
603 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
604 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
605 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
606 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
607 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
608 'w', 'x', 'y', 'z', '0', '1', '2', '3',
609 '4', '5', '6', '7', '8', '9', '+', '/'
610 };
611
612 //
613 // UTF-7 encoding table
614 //
615 // 0 - Set D (directly encoded characters)
616 // 1 - Set O (optional direct characters)
617 // 2 - whitespace characters (optional)
618 // 3 - special characters
619 //
// the table is indexed by the character code (which must be less than 0x80)
// and gives its class as described above; note that wxMBConvUTF7::WC2MB()
// only copies the characters of class 0 directly and encodes all the other
// ones, including the optional ones, in BASE64
620 static const unsigned char utf7encode[128] =
621 {
622 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
623 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
624 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
625 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
626 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
627 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
628 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
629 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
630 };
631
// Converts the NUL-terminated wide string psz to UTF-7.
//
// buf: output buffer or NULL to only compute the length of the result
// n:   size of buf in bytes, ignored if buf is NULL
//
// The characters of class 0 in utf7encode table are copied as is, '+' is
// output as "+-" and any run of other characters as '+' followed by the
// BASE64 encoding of their 16 bit values (high byte first) and by '-'.
//
// Returns the number of bytes produced, not counting the terminating NUL
// (which is only written if there is room left for it), or wxCONV_FAILED if
// wchar_t is not UTF-16 and a character starting a new sequence is greater
// than 0xffff.
//
// NOTE(review): n is only tested at the top of the outer loop while one
// iteration may write many bytes, so buf must be big enough for the entire
// encoded run -- confirm that all callers size it using a NULL buf pass.
// NOTE(review): the characters following the first one inside an encoded
// run are not checked against 0xffff, only their low 16 bits are used.
632 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
633 {
634 size_t len = 0;
635
636 while (*psz && ((!buf) || (len < n)))
637 {
638 wchar_t cc = *psz++;
639 if (cc < 0x80 && utf7encode[cc] < 1)
640 {
641 // plain ASCII char
642 if (buf)
643 *buf++ = (char)cc;
644
645 len++;
646 }
647 #ifndef WC_UTF16
648 else if (((wxUint32)cc) > 0xffff)
649 {
650 // no surrogate pair generation (yet?)
651 return wxCONV_FAILED;
652 }
653 #endif
654 else
655 {
656 if (buf)
657 *buf++ = '+';
658
659 len++;
660 if (cc != '+')
661 {
662 // BASE64 encode string
// d accumulates the bits to encode and l is the number of bits in it
// which were not output yet
663 unsigned int lsb, d, l;
664 for (d = 0, l = 0; /*nothing*/; psz++)
665 {
// append the high (lsb == 0) and then the low byte of the character
666 for (lsb = 0; lsb < 2; lsb ++)
667 {
668 d <<= 8;
669 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
670
// output all complete groups of 6 bits
671 for (l += 8; l >= 6; )
672 {
673 l -= 6;
674 if (buf)
675 *buf++ = utf7enb64[(d >> l) % 64];
676 len++;
677 }
678 }
679
// the run ends at the end of the string or at the first character which
// can be output directly; psz is only advanced if the run continues
680 cc = *psz;
681 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
682 break;
683 }
684
// flush the remaining bits, if any, padding them with zeroes on the right
685 if (l != 0)
686 {
687 if (buf)
688 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
689
690 len++;
691 }
692 }
693
// terminate the encoded run (or complete the "+-" sequence for '+' itself)
694 if (buf)
695 *buf++ = '-';
696 len++;
697 }
698 }
699
700 if (buf && (len < n))
701 *buf = 0;
702
703 return len;
704 }
705
706 // ----------------------------------------------------------------------------
707 // UTF-8
708 // ----------------------------------------------------------------------------
709
710 static wxUint32 utf8_max[]=
711 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
712
713 // boundaries of the private use area we use to (temporarily) remap invalid
714 // characters invalid in a UTF-8 encoded string
715 const wxUint32 wxUnicodePUA = 0x100000;
716 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
717
// Converts the NUL-terminated UTF-8 string psz to wide characters.
//
// buf: output buffer or NULL to only compute the length of the result
// n:   size of buf in wchar_t, ignored if buf is NULL
//
// The handling of the bytes which don't form a valid UTF-8 sequence depends
// on m_options: with MAP_INVALID_UTF8_TO_PUA each of them is mapped to the
// character wxUnicodePUA + byte, with MAP_INVALID_UTF8_TO_OCTAL it is
// replaced by a backslash followed by 3 octal digits (and any backslash in
// the input is doubled), otherwise the conversion fails.
//
// Returns the number of wide characters produced, not counting the
// terminating NUL (which is only written if there is room left for it), or
// wxCONV_FAILED.
//
// NOTE(review): if wchar_t is UTF-16, a surrogate pair is written even when
// only one element is left in buf and, in octal mode, the returned length is
// increased by 4 even when the escape didn't fit into buf and was not
// written -- confirm that all callers size buf using a NULL buf pass.
718 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
719 {
720 size_t len = 0;
721
722 while (*psz && ((!buf) || (len < n)))
723 {
// opsz remembers the start of the sequence to allow remapping its bytes
// one by one if it turns out to be invalid
724 const char *opsz = psz;
725 bool invalid = false;
726 unsigned char cc = *psz++, fc = cc;
// count the leading 1 bits of the first byte
727 unsigned cnt;
728 for (cnt = 0; fc & 0x80; cnt++)
729 fc <<= 1;
730
731 if (!cnt)
732 {
733 // plain ASCII char
734 if (buf)
735 *buf++ = cc;
736 len++;
737
738 // escape the escape character for octal escapes
739 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
740 && cc == '\\' && (!buf || len < n))
741 {
742 if (buf)
743 *buf++ = cc;
744 len++;
745 }
746 }
747 else
748 {
// from now on cnt is the number of the continuation bytes expected
749 cnt--;
750 if (!cnt)
751 {
752 // invalid UTF-8 sequence
753 invalid = true;
754 }
755 else
756 {
757 unsigned ocnt = cnt - 1;
758 wxUint32 res = cc & (0x3f >> cnt);
759 while (cnt--)
760 {
761 cc = *psz;
762 if ((cc & 0xC0) != 0x80)
763 {
764 // invalid UTF-8 sequence
765 invalid = true;
766 break;
767 }
768
769 psz++;
770 res = (res << 6) | (cc & 0x3f);
771 }
772
// a value which could have been encoded using fewer bytes is rejected too
773 if (invalid || res <= utf8_max[ocnt])
774 {
775 // illegal UTF-8 encoding
776 invalid = true;
777 }
778 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
779 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
780 {
781 // if one of our PUA characters turns up externally
782 // it must also be treated as an illegal sequence
783 // (a bit like you have to escape an escape character)
784 invalid = true;
785 }
786 else
787 {
788 #ifdef WC_UTF16
789 // cast is ok because wchar_t == wxUint16 if WC_UTF16
790 size_t pa = encode_utf16(res, (wxUint16 *)buf);
791 if (pa == wxCONV_FAILED)
792 {
793 invalid = true;
794 }
795 else
796 {
797 if (buf)
798 buf += pa;
799 len += pa;
800 }
801 #else // !WC_UTF16
802 if (buf)
803 *buf++ = (wchar_t)res;
804 len++;
805 #endif // WC_UTF16/!WC_UTF16
806 }
807 }
808
809 if (invalid)
810 {
811 if (m_options & MAP_INVALID_UTF8_TO_PUA)
812 {
813 while (opsz < psz && (!buf || len < n))
814 {
815 #ifdef WC_UTF16
816 // cast is ok because wchar_t == wxUint16 if WC_UTF16
817 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
818 wxASSERT(pa != wxCONV_FAILED);
819 if (buf)
820 buf += pa;
821 opsz++;
822 len += pa;
823 #else
824 if (buf)
825 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
826 opsz++;
827 len++;
828 #endif
829 }
830 }
831 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
832 {
833 while (opsz < psz && (!buf || len < n))
834 {
835 if ( buf && len + 3 < n )
836 {
837 unsigned char on = *opsz;
838 *buf++ = L'\\';
839 *buf++ = (wchar_t)( L'0' + on / 0100 );
840 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
841 *buf++ = (wchar_t)( L'0' + on % 010 );
842 }
843
844 opsz++;
845 len += 4;
846 }
847 }
848 else // MAP_INVALID_UTF8_NOT
849 {
850 return wxCONV_FAILED;
851 }
852 }
853 }
854 }
855
856 if (buf && (len < n))
857 *buf = 0;
858
859 return len;
860 }
861
// returns true if wch is one of the octal digits '0'..'7'
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
866
// Converts the NUL-terminated wide string psz to UTF-8.
//
// buf: output buffer or NULL to only compute the length of the result
// n:   size of buf in bytes, ignored if buf is NULL
//
// This function undoes the mappings done by MB2WC() according to m_options:
// with MAP_INVALID_UTF8_TO_PUA the characters in wxUnicodePUA range are
// output as the single byte they stand for and with
// MAP_INVALID_UTF8_TO_OCTAL a double backslash is output as a single one
// and a backslash followed by 3 octal digits as the byte with this value.
//
// Returns the number of bytes produced, not counting the terminating NUL
// (which is only written if there is room left for it).
//
// NOTE(review): n is only tested at the top of the loop, so a multibyte
// sequence may be written beyond the end of buf if less space than needed
// for it remains -- confirm that all callers size buf using a NULL buf pass.
// NOTE(review): if wchar_t is UTF-16, an invalid surrogate is not reported
// as an error: the unit is encoded on its own and the input advanced by 1.
867 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
868 {
869 size_t len = 0;
870
871 while (*psz && ((!buf) || (len < n)))
872 {
873 wxUint32 cc;
874
875 #ifdef WC_UTF16
876 // cast is ok for WC_UTF16
877 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
878 psz += (pa == wxCONV_FAILED) ? 1 : pa;
879 #else
880 cc = (*psz++) & 0x7fffffff;
881 #endif
882
883 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
884 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
885 {
886 if (buf)
887 *buf++ = (char)(cc - wxUnicodePUA);
888 len++;
889 }
890 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
891 && cc == L'\\' && psz[0] == L'\\' )
892 {
893 if (buf)
894 *buf++ = (char)cc;
895 psz++;
896 len++;
897 }
898 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
899 cc == L'\\' &&
900 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
901 {
902 if (buf)
903 {
904 *buf++ = (char) ((psz[0] - L'0') * 0100 +
905 (psz[1] - L'0') * 010 +
906 (psz[2] - L'0'));
907 }
908
909 psz += 3;
910 len++;
911 }
912 else
913 {
// find the number of continuation bytes needed to encode this character
914 unsigned cnt;
915 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
916 {
917 }
918
919 if (!cnt)
920 {
921 // plain ASCII char
922 if (buf)
923 *buf++ = (char) cc;
924 len++;
925 }
926 else
927 {
928 len += cnt + 1;
929 if (buf)
930 {
// the lead byte has cnt + 1 high bits set followed by the most significant
// bits of the character, the other bytes contain 6 bits of it each
931 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
932 while (cnt--)
933 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
934 }
935 }
936 }
937 }
938
939 if (buf && (len < n))
940 *buf = 0;
941
942 return len;
943 }
944
945 // ============================================================================
946 // UTF-16
947 // ============================================================================
948
// the implementations below are written in terms of the "straight"
// converter, which copies the 16 bit units without changing their byte
// order, and the "swap" one, which swaps their bytes: which one of them
// corresponds to the BE and which to the LE class depends on WORDS_BIGENDIAN
949 #ifdef WORDS_BIGENDIAN
950 #define wxMBConvUTF16straight wxMBConvUTF16BE
951 #define wxMBConvUTF16swap wxMBConvUTF16LE
952 #else
953 #define wxMBConvUTF16swap wxMBConvUTF16BE
954 #define wxMBConvUTF16straight wxMBConvUTF16LE
955 #endif
956
957 /* static */
958 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
959 {
960 if ( srcLen == wxNO_LEN )
961 {
962 // count the number of bytes in input, including the trailing NULs
963 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
964 for ( srcLen = 1; *inBuff++; srcLen++ )
965 ;
966
967 srcLen *= BYTES_PER_CHAR;
968 }
969 else // we already have the length
970 {
971 // we can only convert an entire number of UTF-16 characters
972 if ( srcLen % BYTES_PER_CHAR )
973 return wxCONV_FAILED;
974 }
975
976 return srcLen;
977 }
978
979 // case when in-memory representation is UTF-16 too
980 #ifdef WC_UTF16
981
982 // ----------------------------------------------------------------------------
983 // conversions without endianness change
984 // ----------------------------------------------------------------------------
985
986 size_t
987 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
988 const char *src, size_t srcLen) const
989 {
990 // set up the scene for using memcpy() (which is presumably more efficient
991 // than copying the bytes one by one)
992 srcLen = GetLength(src, srcLen);
993 if ( srcLen == wxNO_LEN )
994 return wxCONV_FAILED;
995
996 const size_t inLen = srcLen / BYTES_PER_CHAR;
997 if ( dst )
998 {
999 if ( dstLen < inLen )
1000 return wxCONV_FAILED;
1001
1002 memcpy(dst, src, srcLen);
1003 }
1004
1005 return inLen;
1006 }
1007
1008 size_t
1009 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1010 const wchar_t *src, size_t srcLen) const
1011 {
1012 if ( srcLen == wxNO_LEN )
1013 srcLen = wxWcslen(src) + 1;
1014
1015 srcLen *= BYTES_PER_CHAR;
1016
1017 if ( dst )
1018 {
1019 if ( dstLen < srcLen )
1020 return wxCONV_FAILED;
1021
1022 memcpy(dst, src, srcLen);
1023 }
1024
1025 return srcLen;
1026 }
1027
1028 // ----------------------------------------------------------------------------
1029 // endian-reversing conversions
1030 // ----------------------------------------------------------------------------
1031
1032 size_t
1033 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1034 const char *src, size_t srcLen) const
1035 {
1036 srcLen = GetLength(src, srcLen);
1037 if ( srcLen == wxNO_LEN )
1038 return wxCONV_FAILED;
1039
1040 srcLen /= BYTES_PER_CHAR;
1041
1042 if ( dst )
1043 {
1044 if ( dstLen < srcLen )
1045 return wxCONV_FAILED;
1046
1047 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1048 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1049 {
1050 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1051 }
1052 }
1053
1054 return srcLen;
1055 }
1056
1057 size_t
1058 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1059 const wchar_t *src, size_t srcLen) const
1060 {
1061 if ( srcLen == wxNO_LEN )
1062 srcLen = wxWcslen(src) + 1;
1063
1064 srcLen *= BYTES_PER_CHAR;
1065
1066 if ( dst )
1067 {
1068 if ( dstLen < srcLen )
1069 return wxCONV_FAILED;
1070
1071 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1072 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1073 {
1074 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1075 }
1076 }
1077
1078 return srcLen;
1079 }
1080
1081 #else // !WC_UTF16: wchar_t is UTF-32
1082
1083 // ----------------------------------------------------------------------------
1084 // conversions without endianness change
1085 // ----------------------------------------------------------------------------
1086
1087 size_t
1088 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1089 const char *src, size_t srcLen) const
1090 {
1091 srcLen = GetLength(src, srcLen);
1092 if ( srcLen == wxNO_LEN )
1093 return wxCONV_FAILED;
1094
1095 const size_t inLen = srcLen / BYTES_PER_CHAR;
1096 if ( !dst )
1097 {
1098 // optimization: return maximal space which could be needed for this
1099 // string even if the real size could be smaller if the buffer contains
1100 // any surrogates
1101 return inLen;
1102 }
1103
1104 size_t outLen = 0;
1105 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1106 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1107 {
1108 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1109 if ( !inBuff )
1110 return wxCONV_FAILED;
1111
1112 if ( ++outLen > dstLen )
1113 return wxCONV_FAILED;
1114
1115 *dst++ = ch;
1116 }
1117
1118
1119 return outLen;
1120 }
1121
1122 size_t
1123 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1124 const wchar_t *src, size_t srcLen) const
1125 {
1126 if ( srcLen == wxNO_LEN )
1127 srcLen = wxWcslen(src) + 1;
1128
1129 size_t outLen = 0;
1130 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1131 for ( size_t n = 0; n < srcLen; n++ )
1132 {
1133 wxUint16 cc[2];
1134 const size_t numChars = encode_utf16(*src++, cc);
1135 if ( numChars == wxCONV_FAILED )
1136 return wxCONV_FAILED;
1137
1138 outLen += numChars * BYTES_PER_CHAR;
1139 if ( outBuff )
1140 {
1141 if ( outLen > dstLen )
1142 return wxCONV_FAILED;
1143
1144 *outBuff++ = cc[0];
1145 if ( numChars == 2 )
1146 {
1147 // second character of a surrogate
1148 *outBuff++ = cc[1];
1149 }
1150 }
1151 }
1152
1153 return outLen;
1154 }
1155
1156 // ----------------------------------------------------------------------------
1157 // endian-reversing conversions
1158 // ----------------------------------------------------------------------------
1159
1160 size_t
1161 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1162 const char *src, size_t srcLen) const
1163 {
1164 srcLen = GetLength(src, srcLen);
1165 if ( srcLen == wxNO_LEN )
1166 return wxCONV_FAILED;
1167
1168 const size_t inLen = srcLen / BYTES_PER_CHAR;
1169 if ( !dst )
1170 {
1171 // optimization: return maximal space which could be needed for this
1172 // string even if the real size could be smaller if the buffer contains
1173 // any surrogates
1174 return inLen;
1175 }
1176
1177 size_t outLen = 0;
1178 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1179 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1180 {
1181 wxUint32 ch;
1182 wxUint16 tmp[2];
1183
1184 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1185 inBuff++;
1186 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1187
1188 const size_t numChars = decode_utf16(tmp, ch);
1189 if ( numChars == wxCONV_FAILED )
1190 return wxCONV_FAILED;
1191
1192 if ( numChars == 2 )
1193 inBuff++;
1194
1195 if ( ++outLen > dstLen )
1196 return wxCONV_FAILED;
1197
1198 *dst++ = ch;
1199 }
1200
1201
1202 return outLen;
1203 }
1204
1205 size_t
1206 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1207 const wchar_t *src, size_t srcLen) const
1208 {
1209 if ( srcLen == wxNO_LEN )
1210 srcLen = wxWcslen(src) + 1;
1211
1212 size_t outLen = 0;
1213 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1214 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1215 {
1216 wxUint16 cc[2];
1217 const size_t numChars = encode_utf16(*src, cc);
1218 if ( numChars == wxCONV_FAILED )
1219 return wxCONV_FAILED;
1220
1221 outLen += numChars * BYTES_PER_CHAR;
1222 if ( outBuff )
1223 {
1224 if ( outLen > dstLen )
1225 return wxCONV_FAILED;
1226
1227 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1228 if ( numChars == 2 )
1229 {
1230 // second character of a surrogate
1231 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1232 }
1233 }
1234 }
1235
1236 return outLen;
1237 }
1238
1239 #endif // WC_UTF16/!WC_UTF16
1240
1241
1242 // ============================================================================
1243 // UTF-32
1244 // ============================================================================
1245
// the implementations below are written in terms of the "straight" and
// "swap" converters: which one of them corresponds to the BE and which to
// the LE class depends on WORDS_BIGENDIAN
1246 #ifdef WORDS_BIGENDIAN
1247 #define wxMBConvUTF32straight wxMBConvUTF32BE
1248 #define wxMBConvUTF32swap wxMBConvUTF32LE
1249 #else
1250 #define wxMBConvUTF32swap wxMBConvUTF32BE
1251 #define wxMBConvUTF32straight wxMBConvUTF32LE
1252 #endif
1253
1254
// NOTE(review): these are presumably the definitions of the global UTF-32
// converter objects, confirm against the declarations in wx/strconv.h
1255 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1256 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1257
1258 /* static */
1259 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1260 {
1261 if ( srcLen == wxNO_LEN )
1262 {
1263 // count the number of bytes in input, including the trailing NULs
1264 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1265 for ( srcLen = 1; *inBuff++; srcLen++ )
1266 ;
1267
1268 srcLen *= BYTES_PER_CHAR;
1269 }
1270 else // we already have the length
1271 {
1272 // we can only convert an entire number of UTF-32 characters
1273 if ( srcLen % BYTES_PER_CHAR )
1274 return wxCONV_FAILED;
1275 }
1276
1277 return srcLen;
1278 }
1279
1280 // case when in-memory representation is UTF-16
1281 #ifdef WC_UTF16
1282
1283 // ----------------------------------------------------------------------------
1284 // conversions without endianness change
1285 // ----------------------------------------------------------------------------
1286
1287 size_t
1288 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1289 const char *src, size_t srcLen) const
1290 {
1291 srcLen = GetLength(src, srcLen);
1292 if ( srcLen == wxNO_LEN )
1293 return wxCONV_FAILED;
1294
1295 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1296 const size_t inLen = srcLen / BYTES_PER_CHAR;
1297 size_t outLen = 0;
1298 for ( size_t n = 0; n < inLen; n++ )
1299 {
1300 wxUint16 cc[2];
1301 const size_t numChars = encode_utf16(*inBuff++, cc);
1302 if ( numChars == wxCONV_FAILED )
1303 return wxCONV_FAILED;
1304
1305 outLen += numChars;
1306 if ( dst )
1307 {
1308 if ( outLen > dstLen )
1309 return wxCONV_FAILED;
1310
1311 *dst++ = cc[0];
1312 if ( numChars == 2 )
1313 {
1314 // second character of a surrogate
1315 *dst++ = cc[1];
1316 }
1317 }
1318 }
1319
1320 return outLen;
1321 }
1322
1323 size_t
1324 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1325 const wchar_t *src, size_t srcLen) const
1326 {
1327 if ( srcLen == wxNO_LEN )
1328 srcLen = wxWcslen(src) + 1;
1329
1330 if ( !dst )
1331 {
1332 // optimization: return maximal space which could be needed for this
1333 // string instead of the exact amount which could be less if there are
1334 // any surrogates in the input
1335 //
1336 // we consider that surrogates are rare enough to make it worthwhile to
1337 // avoid running the loop below at the cost of slightly extra memory
1338 // consumption
1339 return srcLen * BYTES_PER_CHAR;
1340 }
1341
1342 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1343 size_t outLen = 0;
1344 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1345 {
1346 const wxUint32 ch = wxDecodeSurrogate(&src);
1347 if ( !src )
1348 return wxCONV_FAILED;
1349
1350 outLen += BYTES_PER_CHAR;
1351
1352 if ( outLen > dstLen )
1353 return wxCONV_FAILED;
1354
1355 *outBuff++ = ch;
1356 }
1357
1358 return outLen;
1359 }
1360
1361 // ----------------------------------------------------------------------------
1362 // endian-reversing conversions
1363 // ----------------------------------------------------------------------------
1364
1365 size_t
1366 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1367 const char *src, size_t srcLen) const
1368 {
1369 srcLen = GetLength(src, srcLen);
1370 if ( srcLen == wxNO_LEN )
1371 return wxCONV_FAILED;
1372
1373 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1374 const size_t inLen = srcLen / BYTES_PER_CHAR;
1375 size_t outLen = 0;
1376 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1377 {
1378 wxUint16 cc[2];
1379 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1380 if ( numChars == wxCONV_FAILED )
1381 return wxCONV_FAILED;
1382
1383 outLen += numChars;
1384 if ( dst )
1385 {
1386 if ( outLen > dstLen )
1387 return wxCONV_FAILED;
1388
1389 *dst++ = cc[0];
1390 if ( numChars == 2 )
1391 {
1392 // second character of a surrogate
1393 *dst++ = cc[1];
1394 }
1395 }
1396 }
1397
1398 return outLen;
1399 }
1400
1401 size_t
1402 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1403 const wchar_t *src, size_t srcLen) const
1404 {
1405 if ( srcLen == wxNO_LEN )
1406 srcLen = wxWcslen(src) + 1;
1407
1408 if ( !dst )
1409 {
1410 // optimization: return maximal space which could be needed for this
1411 // string instead of the exact amount which could be less if there are
1412 // any surrogates in the input
1413 //
1414 // we consider that surrogates are rare enough to make it worthwhile to
1415 // avoid running the loop below at the cost of slightly extra memory
1416 // consumption
1417 return srcLen*BYTES_PER_CHAR;
1418 }
1419
1420 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1421 size_t outLen = 0;
1422 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1423 {
1424 const wxUint32 ch = wxDecodeSurrogate(&src);
1425 if ( !src )
1426 return wxCONV_FAILED;
1427
1428 outLen += BYTES_PER_CHAR;
1429
1430 if ( outLen > dstLen )
1431 return wxCONV_FAILED;
1432
1433 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1434 }
1435
1436 return outLen;
1437 }
1438
1439 #else // !WC_UTF16: wchar_t is UTF-32
1440
1441 // ----------------------------------------------------------------------------
1442 // conversions without endianness change
1443 // ----------------------------------------------------------------------------
1444
1445 size_t
1446 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1447 const char *src, size_t srcLen) const
1448 {
1449 // use memcpy() as it should be much faster than hand-written loop
1450 srcLen = GetLength(src, srcLen);
1451 if ( srcLen == wxNO_LEN )
1452 return wxCONV_FAILED;
1453
1454 const size_t inLen = srcLen/BYTES_PER_CHAR;
1455 if ( dst )
1456 {
1457 if ( dstLen < inLen )
1458 return wxCONV_FAILED;
1459
1460 memcpy(dst, src, srcLen);
1461 }
1462
1463 return inLen;
1464 }
1465
1466 size_t
1467 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1468 const wchar_t *src, size_t srcLen) const
1469 {
1470 if ( srcLen == wxNO_LEN )
1471 srcLen = wxWcslen(src) + 1;
1472
1473 srcLen *= BYTES_PER_CHAR;
1474
1475 if ( dst )
1476 {
1477 if ( dstLen < srcLen )
1478 return wxCONV_FAILED;
1479
1480 memcpy(dst, src, srcLen);
1481 }
1482
1483 return srcLen;
1484 }
1485
1486 // ----------------------------------------------------------------------------
1487 // endian-reversing conversions
1488 // ----------------------------------------------------------------------------
1489
1490 size_t
1491 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1492 const char *src, size_t srcLen) const
1493 {
1494 srcLen = GetLength(src, srcLen);
1495 if ( srcLen == wxNO_LEN )
1496 return wxCONV_FAILED;
1497
1498 srcLen /= BYTES_PER_CHAR;
1499
1500 if ( dst )
1501 {
1502 if ( dstLen < srcLen )
1503 return wxCONV_FAILED;
1504
1505 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1506 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1507 {
1508 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1509 }
1510 }
1511
1512 return srcLen;
1513 }
1514
1515 size_t
1516 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1517 const wchar_t *src, size_t srcLen) const
1518 {
1519 if ( srcLen == wxNO_LEN )
1520 srcLen = wxWcslen(src) + 1;
1521
1522 srcLen *= BYTES_PER_CHAR;
1523
1524 if ( dst )
1525 {
1526 if ( dstLen < srcLen )
1527 return wxCONV_FAILED;
1528
1529 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1530 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1531 {
1532 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1533 }
1534 }
1535
1536 return srcLen;
1537 }
1538
1539 #endif // WC_UTF16/!WC_UTF16
1540
1541
1542 // ============================================================================
1543 // The classes doing conversion using the iconv_xxx() functions
1544 // ============================================================================
1545
1546 #ifdef HAVE_ICONV
1547
// ICONV_FAILED(cres, bufLeft) tests whether an iconv() call failed: cres is
// the value returned by iconv() and bufLeft the number of input bytes it left
// unconverted.
//
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails
//     with E2BIG if the output buffer is _exactly_ as big as needed. Such a
//     case is (unless there's yet another bug in glibc) the only case when
//     iconv() returns (size_t)-1 (which means error) and says there are 0
//     bytes left in the input buffer -- when a _real_ error occurs, the
//     number of bytes left in the input buffer is non-zero. Hence, this
//     alternative test for iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// NB: the macro parameters are parenthesized so that arbitrary expressions
//     can be safely passed as arguments
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif
1562
1563 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1564
1565 #define ICONV_T_INVALID ((iconv_t)-1)
1566
1567 #if SIZEOF_WCHAR_T == 4
1568 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1569 #define WC_ENC wxFONTENCODING_UTF32
1570 #elif SIZEOF_WCHAR_T == 2
1571 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1572 #define WC_ENC wxFONTENCODING_UTF16
1573 #else // sizeof(wchar_t) != 2 nor 4
1574 // does this ever happen?
1575 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1576 #endif
1577
1578 // ----------------------------------------------------------------------------
1579 // wxMBConv_iconv: encapsulates an iconv character set
1580 // ----------------------------------------------------------------------------
1581
1582 class wxMBConv_iconv : public wxMBConv
1583 {
1584 public:
1585 wxMBConv_iconv(const wxChar *name);
1586 virtual ~wxMBConv_iconv();
1587
1588 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1589 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1590
1591 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1592 virtual size_t GetMBNulLen() const;
1593
1594 virtual wxMBConv *Clone() const
1595 {
1596 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1597 p->m_minMBCharWidth = m_minMBCharWidth;
1598 return p;
1599 }
1600
1601 bool IsOk() const
1602 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1603
1604 protected:
1605 // the iconv handlers used to translate from multibyte
1606 // to wide char and in the other direction
1607 iconv_t m2w,
1608 w2m;
1609
1610 #if wxUSE_THREADS
1611 // guards access to m2w and w2m objects
1612 wxMutex m_iconvMutex;
1613 #endif
1614
1615 private:
1616 // the name (for iconv_open()) of a wide char charset -- if none is
1617 // available on this machine, it will remain NULL
1618 static wxString ms_wcCharsetName;
1619
1620 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1621 // different endian-ness than the native one
1622 static bool ms_wcNeedsSwap;
1623
1624
1625 // name of the encoding handled by this conversion
1626 wxString m_name;
1627
1628 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1629 // initially
1630 size_t m_minMBCharWidth;
1631 };
1632
1633 // make the constructor available for unit testing
1634 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1635 {
1636 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1637 if ( !result->IsOk() )
1638 {
1639 delete result;
1640 return 0;
1641 }
1642
1643 return result;
1644 }
1645
1646 wxString wxMBConv_iconv::ms_wcCharsetName;
1647 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1648
1649 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1650 : m_name(name)
1651 {
1652 m_minMBCharWidth = 0;
1653
1654 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1655 // names for the charsets
1656 const wxCharBuffer cname(wxString(name).ToAscii());
1657
1658 // check for charset that represents wchar_t:
1659 if ( ms_wcCharsetName.empty() )
1660 {
1661 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1662
1663 #if wxUSE_FONTMAP
1664 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1665 #else // !wxUSE_FONTMAP
1666 static const wxChar *names[] =
1667 {
1668 #if SIZEOF_WCHAR_T == 4
1669 _T("UCS-4"),
1670 #elif SIZEOF_WCHAR_T = 2
1671 _T("UCS-2"),
1672 #endif
1673 NULL
1674 };
1675 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1676
1677 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1678 {
1679 const wxString nameCS(*names);
1680
1681 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1682 wxString nameXE(nameCS);
1683
1684 #ifdef WORDS_BIGENDIAN
1685 nameXE += _T("BE");
1686 #else // little endian
1687 nameXE += _T("LE");
1688 #endif
1689
1690 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1691 nameXE.c_str());
1692
1693 m2w = iconv_open(nameXE.ToAscii(), cname);
1694 if ( m2w == ICONV_T_INVALID )
1695 {
1696 // try charset w/o bytesex info (e.g. "UCS4")
1697 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1698 nameCS.c_str());
1699 m2w = iconv_open(nameCS.ToAscii(), cname);
1700
1701 // and check for bytesex ourselves:
1702 if ( m2w != ICONV_T_INVALID )
1703 {
1704 char buf[2], *bufPtr;
1705 wchar_t wbuf[2], *wbufPtr;
1706 size_t insz, outsz;
1707 size_t res;
1708
1709 buf[0] = 'A';
1710 buf[1] = 0;
1711 wbuf[0] = 0;
1712 insz = 2;
1713 outsz = SIZEOF_WCHAR_T * 2;
1714 wbufPtr = wbuf;
1715 bufPtr = buf;
1716
1717 res = iconv(
1718 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1719 (char**)&wbufPtr, &outsz);
1720
1721 if (ICONV_FAILED(res, insz))
1722 {
1723 wxLogLastError(wxT("iconv"));
1724 wxLogError(_("Conversion to charset '%s' doesn't work."),
1725 nameCS.c_str());
1726 }
1727 else // ok, can convert to this encoding, remember it
1728 {
1729 ms_wcCharsetName = nameCS;
1730 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1731 }
1732 }
1733 }
1734 else // use charset not requiring byte swapping
1735 {
1736 ms_wcCharsetName = nameXE;
1737 }
1738 }
1739
1740 wxLogTrace(TRACE_STRCONV,
1741 wxT("iconv wchar_t charset is \"%s\"%s"),
1742 ms_wcCharsetName.empty() ? _T("<none>")
1743 : ms_wcCharsetName.c_str(),
1744 ms_wcNeedsSwap ? _T(" (needs swap)")
1745 : _T(""));
1746 }
1747 else // we already have ms_wcCharsetName
1748 {
1749 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1750 }
1751
1752 if ( ms_wcCharsetName.empty() )
1753 {
1754 w2m = ICONV_T_INVALID;
1755 }
1756 else
1757 {
1758 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1759 if ( w2m == ICONV_T_INVALID )
1760 {
1761 wxLogTrace(TRACE_STRCONV,
1762 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1763 ms_wcCharsetName.c_str(), cname.data());
1764 }
1765 }
1766 }
1767
1768 wxMBConv_iconv::~wxMBConv_iconv()
1769 {
1770 if ( m2w != ICONV_T_INVALID )
1771 iconv_close(m2w);
1772 if ( w2m != ICONV_T_INVALID )
1773 iconv_close(w2m);
1774 }
1775
1776 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1777 {
1778 // find the string length: notice that must be done differently for
1779 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1780 size_t inbuf;
1781 const size_t nulLen = GetMBNulLen();
1782 switch ( nulLen )
1783 {
1784 default:
1785 return wxCONV_FAILED;
1786
1787 case 1:
1788 inbuf = strlen(psz); // arguably more optimized than our version
1789 break;
1790
1791 case 2:
1792 case 4:
1793 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1794 // they also have to start at character boundary and not span two
1795 // adjacent characters
1796 const char *p;
1797 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1798 ;
1799 inbuf = p - psz;
1800 break;
1801 }
1802
1803 #if wxUSE_THREADS
1804 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1805 // Unfortunately there is a couple of global wxCSConv objects such as
1806 // wxConvLocal that are used all over wx code, so we have to make sure
1807 // the handle is used by at most one thread at the time. Otherwise
1808 // only a few wx classes would be safe to use from non-main threads
1809 // as MB<->WC conversion would fail "randomly".
1810 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1811 #endif // wxUSE_THREADS
1812
1813 size_t outbuf = n * SIZEOF_WCHAR_T;
1814 size_t res, cres;
1815 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1816 wchar_t *bufPtr = buf;
1817 const char *pszPtr = psz;
1818
1819 if (buf)
1820 {
1821 // have destination buffer, convert there
1822 cres = iconv(m2w,
1823 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1824 (char**)&bufPtr, &outbuf);
1825 res = n - (outbuf / SIZEOF_WCHAR_T);
1826
1827 if (ms_wcNeedsSwap)
1828 {
1829 // convert to native endianness
1830 for ( unsigned i = 0; i < res; i++ )
1831 buf[n] = WC_BSWAP(buf[i]);
1832 }
1833
1834 // NUL-terminate the string if there is any space left
1835 if (res < n)
1836 buf[res] = 0;
1837 }
1838 else
1839 {
1840 // no destination buffer... convert using temp buffer
1841 // to calculate destination buffer requirement
1842 wchar_t tbuf[8];
1843 res = 0;
1844
1845 do
1846 {
1847 bufPtr = tbuf;
1848 outbuf = 8 * SIZEOF_WCHAR_T;
1849
1850 cres = iconv(m2w,
1851 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1852 (char**)&bufPtr, &outbuf );
1853
1854 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1855 }
1856 while ((cres == (size_t)-1) && (errno == E2BIG));
1857 }
1858
1859 if (ICONV_FAILED(cres, inbuf))
1860 {
1861 //VS: it is ok if iconv fails, hence trace only
1862 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1863 return wxCONV_FAILED;
1864 }
1865
1866 return res;
1867 }
1868
1869 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1870 {
1871 #if wxUSE_THREADS
1872 // NB: explained in MB2WC
1873 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1874 #endif
1875
1876 size_t inlen = wxWcslen(psz);
1877 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1878 size_t outbuf = n;
1879 size_t res, cres;
1880
1881 wchar_t *tmpbuf = 0;
1882
1883 if (ms_wcNeedsSwap)
1884 {
1885 // need to copy to temp buffer to switch endianness
1886 // (doing WC_BSWAP twice on the original buffer won't help, as it
1887 // could be in read-only memory, or be accessed in some other thread)
1888 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1889 for ( size_t i = 0; i < inlen; i++ )
1890 tmpbuf[n] = WC_BSWAP(psz[i]);
1891
1892 tmpbuf[inlen] = L'\0';
1893 psz = tmpbuf;
1894 }
1895
1896 if (buf)
1897 {
1898 // have destination buffer, convert there
1899 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1900
1901 res = n - outbuf;
1902
1903 // NB: iconv was given only wcslen(psz) characters on input, and so
1904 // it couldn't convert the trailing zero. Let's do it ourselves
1905 // if there's some room left for it in the output buffer.
1906 if (res < n)
1907 buf[0] = 0;
1908 }
1909 else
1910 {
1911 // no destination buffer: convert using temp buffer
1912 // to calculate destination buffer requirement
1913 char tbuf[16];
1914 res = 0;
1915 do
1916 {
1917 buf = tbuf;
1918 outbuf = 16;
1919
1920 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1921
1922 res += 16 - outbuf;
1923 }
1924 while ((cres == (size_t)-1) && (errno == E2BIG));
1925 }
1926
1927 if (ms_wcNeedsSwap)
1928 {
1929 free(tmpbuf);
1930 }
1931
1932 if (ICONV_FAILED(cres, inbuf))
1933 {
1934 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1935 return wxCONV_FAILED;
1936 }
1937
1938 return res;
1939 }
1940
1941 size_t wxMBConv_iconv::GetMBNulLen() const
1942 {
1943 if ( m_minMBCharWidth == 0 )
1944 {
1945 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1946
1947 #if wxUSE_THREADS
1948 // NB: explained in MB2WC
1949 wxMutexLocker lock(self->m_iconvMutex);
1950 #endif
1951
1952 wchar_t *wnul = L"";
1953 char buf[8]; // should be enough for NUL in any encoding
1954 size_t inLen = sizeof(wchar_t),
1955 outLen = WXSIZEOF(buf);
1956 char *inBuff = (char *)wnul;
1957 char *outBuff = buf;
1958 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1959 {
1960 self->m_minMBCharWidth = (size_t)-1;
1961 }
1962 else // ok
1963 {
1964 self->m_minMBCharWidth = outBuff - buf;
1965 }
1966 }
1967
1968 return m_minMBCharWidth;
1969 }
1970
1971 #endif // HAVE_ICONV
1972
1973
1974 // ============================================================================
1975 // Win32 conversion classes
1976 // ============================================================================
1977
1978 #ifdef wxHAVE_WIN32_MB2WC
1979
1980 // from utils.cpp
1981 #if wxUSE_FONTMAP
1982 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1983 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1984 #endif
1985
1986 class wxMBConv_win32 : public wxMBConv
1987 {
1988 public:
1989 wxMBConv_win32()
1990 {
1991 m_CodePage = CP_ACP;
1992 m_minMBCharWidth = 0;
1993 }
1994
1995 wxMBConv_win32(const wxMBConv_win32& conv)
1996 {
1997 m_CodePage = conv.m_CodePage;
1998 m_minMBCharWidth = conv.m_minMBCharWidth;
1999 }
2000
2001 #if wxUSE_FONTMAP
2002 wxMBConv_win32(const wxChar* name)
2003 {
2004 m_CodePage = wxCharsetToCodepage(name);
2005 m_minMBCharWidth = 0;
2006 }
2007
2008 wxMBConv_win32(wxFontEncoding encoding)
2009 {
2010 m_CodePage = wxEncodingToCodepage(encoding);
2011 m_minMBCharWidth = 0;
2012 }
2013 #endif // wxUSE_FONTMAP
2014
2015 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2016 {
2017 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2018 // the behaviour is not compatible with the Unix version (using iconv)
2019 // and break the library itself, e.g. wxTextInputStream::NextChar()
2020 // wouldn't work if reading an incomplete MB char didn't result in an
2021 // error
2022 //
2023 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2024 // Win XP or newer and it is not supported for UTF-[78] so we always
2025 // use our own conversions in this case. See
2026 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2027 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2028 if ( m_CodePage == CP_UTF8 )
2029 {
2030 return wxConvUTF8.MB2WC(buf, psz, n);
2031 }
2032
2033 if ( m_CodePage == CP_UTF7 )
2034 {
2035 return wxConvUTF7.MB2WC(buf, psz, n);
2036 }
2037
2038 int flags = 0;
2039 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2040 IsAtLeastWin2kSP4() )
2041 {
2042 flags = MB_ERR_INVALID_CHARS;
2043 }
2044
2045 const size_t len = ::MultiByteToWideChar
2046 (
2047 m_CodePage, // code page
2048 flags, // flags: fall on error
2049 psz, // input string
2050 -1, // its length (NUL-terminated)
2051 buf, // output string
2052 buf ? n : 0 // size of output buffer
2053 );
2054 if ( !len )
2055 {
2056 // function totally failed
2057 return wxCONV_FAILED;
2058 }
2059
2060 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2061 // check if we succeeded, by doing a double trip:
2062 if ( !flags && buf )
2063 {
2064 const size_t mbLen = strlen(psz);
2065 wxCharBuffer mbBuf(mbLen);
2066 if ( ::WideCharToMultiByte
2067 (
2068 m_CodePage,
2069 0,
2070 buf,
2071 -1,
2072 mbBuf.data(),
2073 mbLen + 1, // size in bytes, not length
2074 NULL,
2075 NULL
2076 ) == 0 ||
2077 strcmp(mbBuf, psz) != 0 )
2078 {
2079 // we didn't obtain the same thing we started from, hence
2080 // the conversion was lossy and we consider that it failed
2081 return wxCONV_FAILED;
2082 }
2083 }
2084
2085 // note that it returns count of written chars for buf != NULL and size
2086 // of the needed buffer for buf == NULL so in either case the length of
2087 // the string (which never includes the terminating NUL) is one less
2088 return len - 1;
2089 }
2090
2091 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2092 {
2093 /*
2094 we have a problem here: by default, WideCharToMultiByte() may
2095 replace characters unrepresentable in the target code page with bad
2096 quality approximations such as turning "1/2" symbol (U+00BD) into
2097 "1" for the code pages which don't have it and we, obviously, want
2098 to avoid this at any price
2099
2100 the trouble is that this function does it _silently_, i.e. it won't
2101 even tell us whether it did or not... Win98/2000 and higher provide
2102 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2103 we have to resort to a round trip, i.e. check that converting back
2104 results in the same string -- this is, of course, expensive but
2105 otherwise we simply can't be sure to not garble the data.
2106 */
2107
2108 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2109 // it doesn't work with CJK encodings (which we test for rather roughly
2110 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2111 // supporting it
2112 BOOL usedDef wxDUMMY_INITIALIZE(false);
2113 BOOL *pUsedDef;
2114 int flags;
2115 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2116 {
2117 // it's our lucky day
2118 flags = WC_NO_BEST_FIT_CHARS;
2119 pUsedDef = &usedDef;
2120 }
2121 else // old system or unsupported encoding
2122 {
2123 flags = 0;
2124 pUsedDef = NULL;
2125 }
2126
2127 const size_t len = ::WideCharToMultiByte
2128 (
2129 m_CodePage, // code page
2130 flags, // either none or no best fit
2131 pwz, // input string
2132 -1, // it is (wide) NUL-terminated
2133 buf, // output buffer
2134 buf ? n : 0, // and its size
2135 NULL, // default "replacement" char
2136 pUsedDef // [out] was it used?
2137 );
2138
2139 if ( !len )
2140 {
2141 // function totally failed
2142 return wxCONV_FAILED;
2143 }
2144
2145 // if we were really converting, check if we succeeded
2146 if ( buf )
2147 {
2148 if ( flags )
2149 {
2150 // check if the conversion failed, i.e. if any replacements
2151 // were done
2152 if ( usedDef )
2153 return wxCONV_FAILED;
2154 }
2155 else // we must resort to double tripping...
2156 {
2157 wxWCharBuffer wcBuf(n);
2158 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2159 wcscmp(wcBuf, pwz) != 0 )
2160 {
2161 // we didn't obtain the same thing we started from, hence
2162 // the conversion was lossy and we consider that it failed
2163 return wxCONV_FAILED;
2164 }
2165 }
2166 }
2167
2168 // see the comment above for the reason of "len - 1"
2169 return len - 1;
2170 }
2171
2172 virtual size_t GetMBNulLen() const
2173 {
2174 if ( m_minMBCharWidth == 0 )
2175 {
2176 int len = ::WideCharToMultiByte
2177 (
2178 m_CodePage, // code page
2179 0, // no flags
2180 L"", // input string
2181 1, // translate just the NUL
2182 NULL, // output buffer
2183 0, // and its size
2184 NULL, // no replacement char
2185 NULL // [out] don't care if it was used
2186 );
2187
2188 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2189 switch ( len )
2190 {
2191 default:
2192 wxLogDebug(_T("Unexpected NUL length %d"), len);
2193 self->m_minMBCharWidth = (size_t)-1;
2194 break;
2195
2196 case 0:
2197 self->m_minMBCharWidth = (size_t)-1;
2198 break;
2199
2200 case 1:
2201 case 2:
2202 case 4:
2203 self->m_minMBCharWidth = len;
2204 break;
2205 }
2206 }
2207
2208 return m_minMBCharWidth;
2209 }
2210
2211 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2212
2213 bool IsOk() const { return m_CodePage != -1; }
2214
2215 private:
2216 static bool CanUseNoBestFit()
2217 {
2218 static int s_isWin98Or2k = -1;
2219
2220 if ( s_isWin98Or2k == -1 )
2221 {
2222 int verMaj, verMin;
2223 switch ( wxGetOsVersion(&verMaj, &verMin) )
2224 {
2225 case wxWIN95:
2226 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2227 break;
2228
2229 case wxWINDOWS_NT:
2230 s_isWin98Or2k = verMaj >= 5;
2231 break;
2232
2233 default:
2234 // unknown: be conservative by default
2235 s_isWin98Or2k = 0;
2236 break;
2237 }
2238
2239 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2240 }
2241
2242 return s_isWin98Or2k == 1;
2243 }
2244
2245 static bool IsAtLeastWin2kSP4()
2246 {
2247 #ifdef __WXWINCE__
2248 return false;
2249 #else
2250 static int s_isAtLeastWin2kSP4 = -1;
2251
2252 if ( s_isAtLeastWin2kSP4 == -1 )
2253 {
2254 OSVERSIONINFOEX ver;
2255
2256 memset(&ver, 0, sizeof(ver));
2257 ver.dwOSVersionInfoSize = sizeof(ver);
2258 GetVersionEx((OSVERSIONINFO*)&ver);
2259
2260 s_isAtLeastWin2kSP4 =
2261 ((ver.dwMajorVersion > 5) || // Vista+
2262 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2263 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2264 ver.wServicePackMajor >= 4)) // 2000 SP4+
2265 ? 1 : 0;
2266 }
2267
2268 return s_isAtLeastWin2kSP4 == 1;
2269 #endif
2270 }
2271
2272
2273 // the code page we're working with
2274 long m_CodePage;
2275
2276 // cached result of GetMBNulLen(), set to 0 initially meaning
2277 // "unknown"
2278 size_t m_minMBCharWidth;
2279 };
2280
2281 #endif // wxHAVE_WIN32_MB2WC
2282
2283 // ============================================================================
2284 // Cocoa conversion classes
2285 // ============================================================================
2286
2287 #if defined(__WXCOCOA__)
2288
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a
// bit - it's just not public (yet).
2292
2293 #include <CoreFoundation/CFString.h>
2294 #include <CoreFoundation/CFStringEncodingExt.h>
2295
2296 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2297 {
2298 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2299
2300 switch (encoding)
2301 {
2302 case wxFONTENCODING_DEFAULT :
2303 enc = CFStringGetSystemEncoding();
2304 break ;
2305
2306 case wxFONTENCODING_ISO8859_1 :
2307 enc = kCFStringEncodingISOLatin1 ;
2308 break ;
2309 case wxFONTENCODING_ISO8859_2 :
2310 enc = kCFStringEncodingISOLatin2;
2311 break ;
2312 case wxFONTENCODING_ISO8859_3 :
2313 enc = kCFStringEncodingISOLatin3 ;
2314 break ;
2315 case wxFONTENCODING_ISO8859_4 :
2316 enc = kCFStringEncodingISOLatin4;
2317 break ;
2318 case wxFONTENCODING_ISO8859_5 :
2319 enc = kCFStringEncodingISOLatinCyrillic;
2320 break ;
2321 case wxFONTENCODING_ISO8859_6 :
2322 enc = kCFStringEncodingISOLatinArabic;
2323 break ;
2324 case wxFONTENCODING_ISO8859_7 :
2325 enc = kCFStringEncodingISOLatinGreek;
2326 break ;
2327 case wxFONTENCODING_ISO8859_8 :
2328 enc = kCFStringEncodingISOLatinHebrew;
2329 break ;
2330 case wxFONTENCODING_ISO8859_9 :
2331 enc = kCFStringEncodingISOLatin5;
2332 break ;
2333 case wxFONTENCODING_ISO8859_10 :
2334 enc = kCFStringEncodingISOLatin6;
2335 break ;
2336 case wxFONTENCODING_ISO8859_11 :
2337 enc = kCFStringEncodingISOLatinThai;
2338 break ;
2339 case wxFONTENCODING_ISO8859_13 :
2340 enc = kCFStringEncodingISOLatin7;
2341 break ;
2342 case wxFONTENCODING_ISO8859_14 :
2343 enc = kCFStringEncodingISOLatin8;
2344 break ;
2345 case wxFONTENCODING_ISO8859_15 :
2346 enc = kCFStringEncodingISOLatin9;
2347 break ;
2348
2349 case wxFONTENCODING_KOI8 :
2350 enc = kCFStringEncodingKOI8_R;
2351 break ;
2352 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2353 enc = kCFStringEncodingDOSRussian;
2354 break ;
2355
2356 // case wxFONTENCODING_BULGARIAN :
2357 // enc = ;
2358 // break ;
2359
2360 case wxFONTENCODING_CP437 :
2361 enc = kCFStringEncodingDOSLatinUS ;
2362 break ;
2363 case wxFONTENCODING_CP850 :
2364 enc = kCFStringEncodingDOSLatin1;
2365 break ;
2366 case wxFONTENCODING_CP852 :
2367 enc = kCFStringEncodingDOSLatin2;
2368 break ;
2369 case wxFONTENCODING_CP855 :
2370 enc = kCFStringEncodingDOSCyrillic;
2371 break ;
2372 case wxFONTENCODING_CP866 :
2373 enc = kCFStringEncodingDOSRussian ;
2374 break ;
2375 case wxFONTENCODING_CP874 :
2376 enc = kCFStringEncodingDOSThai;
2377 break ;
2378 case wxFONTENCODING_CP932 :
2379 enc = kCFStringEncodingDOSJapanese;
2380 break ;
2381 case wxFONTENCODING_CP936 :
2382 enc = kCFStringEncodingDOSChineseSimplif ;
2383 break ;
2384 case wxFONTENCODING_CP949 :
2385 enc = kCFStringEncodingDOSKorean;
2386 break ;
2387 case wxFONTENCODING_CP950 :
2388 enc = kCFStringEncodingDOSChineseTrad;
2389 break ;
2390 case wxFONTENCODING_CP1250 :
2391 enc = kCFStringEncodingWindowsLatin2;
2392 break ;
2393 case wxFONTENCODING_CP1251 :
2394 enc = kCFStringEncodingWindowsCyrillic ;
2395 break ;
2396 case wxFONTENCODING_CP1252 :
2397 enc = kCFStringEncodingWindowsLatin1 ;
2398 break ;
2399 case wxFONTENCODING_CP1253 :
2400 enc = kCFStringEncodingWindowsGreek;
2401 break ;
2402 case wxFONTENCODING_CP1254 :
2403 enc = kCFStringEncodingWindowsLatin5;
2404 break ;
2405 case wxFONTENCODING_CP1255 :
2406 enc = kCFStringEncodingWindowsHebrew ;
2407 break ;
2408 case wxFONTENCODING_CP1256 :
2409 enc = kCFStringEncodingWindowsArabic ;
2410 break ;
2411 case wxFONTENCODING_CP1257 :
2412 enc = kCFStringEncodingWindowsBalticRim;
2413 break ;
2414 // This only really encodes to UTF7 (if that) evidently
2415 // case wxFONTENCODING_UTF7 :
2416 // enc = kCFStringEncodingNonLossyASCII ;
2417 // break ;
2418 case wxFONTENCODING_UTF8 :
2419 enc = kCFStringEncodingUTF8 ;
2420 break ;
2421 case wxFONTENCODING_EUC_JP :
2422 enc = kCFStringEncodingEUC_JP;
2423 break ;
2424 case wxFONTENCODING_UTF16 :
2425 enc = kCFStringEncodingUnicode ;
2426 break ;
2427 case wxFONTENCODING_MACROMAN :
2428 enc = kCFStringEncodingMacRoman ;
2429 break ;
2430 case wxFONTENCODING_MACJAPANESE :
2431 enc = kCFStringEncodingMacJapanese ;
2432 break ;
2433 case wxFONTENCODING_MACCHINESETRAD :
2434 enc = kCFStringEncodingMacChineseTrad ;
2435 break ;
2436 case wxFONTENCODING_MACKOREAN :
2437 enc = kCFStringEncodingMacKorean ;
2438 break ;
2439 case wxFONTENCODING_MACARABIC :
2440 enc = kCFStringEncodingMacArabic ;
2441 break ;
2442 case wxFONTENCODING_MACHEBREW :
2443 enc = kCFStringEncodingMacHebrew ;
2444 break ;
2445 case wxFONTENCODING_MACGREEK :
2446 enc = kCFStringEncodingMacGreek ;
2447 break ;
2448 case wxFONTENCODING_MACCYRILLIC :
2449 enc = kCFStringEncodingMacCyrillic ;
2450 break ;
2451 case wxFONTENCODING_MACDEVANAGARI :
2452 enc = kCFStringEncodingMacDevanagari ;
2453 break ;
2454 case wxFONTENCODING_MACGURMUKHI :
2455 enc = kCFStringEncodingMacGurmukhi ;
2456 break ;
2457 case wxFONTENCODING_MACGUJARATI :
2458 enc = kCFStringEncodingMacGujarati ;
2459 break ;
2460 case wxFONTENCODING_MACORIYA :
2461 enc = kCFStringEncodingMacOriya ;
2462 break ;
2463 case wxFONTENCODING_MACBENGALI :
2464 enc = kCFStringEncodingMacBengali ;
2465 break ;
2466 case wxFONTENCODING_MACTAMIL :
2467 enc = kCFStringEncodingMacTamil ;
2468 break ;
2469 case wxFONTENCODING_MACTELUGU :
2470 enc = kCFStringEncodingMacTelugu ;
2471 break ;
2472 case wxFONTENCODING_MACKANNADA :
2473 enc = kCFStringEncodingMacKannada ;
2474 break ;
2475 case wxFONTENCODING_MACMALAJALAM :
2476 enc = kCFStringEncodingMacMalayalam ;
2477 break ;
2478 case wxFONTENCODING_MACSINHALESE :
2479 enc = kCFStringEncodingMacSinhalese ;
2480 break ;
2481 case wxFONTENCODING_MACBURMESE :
2482 enc = kCFStringEncodingMacBurmese ;
2483 break ;
2484 case wxFONTENCODING_MACKHMER :
2485 enc = kCFStringEncodingMacKhmer ;
2486 break ;
2487 case wxFONTENCODING_MACTHAI :
2488 enc = kCFStringEncodingMacThai ;
2489 break ;
2490 case wxFONTENCODING_MACLAOTIAN :
2491 enc = kCFStringEncodingMacLaotian ;
2492 break ;
2493 case wxFONTENCODING_MACGEORGIAN :
2494 enc = kCFStringEncodingMacGeorgian ;
2495 break ;
2496 case wxFONTENCODING_MACARMENIAN :
2497 enc = kCFStringEncodingMacArmenian ;
2498 break ;
2499 case wxFONTENCODING_MACCHINESESIMP :
2500 enc = kCFStringEncodingMacChineseSimp ;
2501 break ;
2502 case wxFONTENCODING_MACTIBETAN :
2503 enc = kCFStringEncodingMacTibetan ;
2504 break ;
2505 case wxFONTENCODING_MACMONGOLIAN :
2506 enc = kCFStringEncodingMacMongolian ;
2507 break ;
2508 case wxFONTENCODING_MACETHIOPIC :
2509 enc = kCFStringEncodingMacEthiopic ;
2510 break ;
2511 case wxFONTENCODING_MACCENTRALEUR :
2512 enc = kCFStringEncodingMacCentralEurRoman ;
2513 break ;
2514 case wxFONTENCODING_MACVIATNAMESE :
2515 enc = kCFStringEncodingMacVietnamese ;
2516 break ;
2517 case wxFONTENCODING_MACARABICEXT :
2518 enc = kCFStringEncodingMacExtArabic ;
2519 break ;
2520 case wxFONTENCODING_MACSYMBOL :
2521 enc = kCFStringEncodingMacSymbol ;
2522 break ;
2523 case wxFONTENCODING_MACDINGBATS :
2524 enc = kCFStringEncodingMacDingbats ;
2525 break ;
2526 case wxFONTENCODING_MACTURKISH :
2527 enc = kCFStringEncodingMacTurkish ;
2528 break ;
2529 case wxFONTENCODING_MACCROATIAN :
2530 enc = kCFStringEncodingMacCroatian ;
2531 break ;
2532 case wxFONTENCODING_MACICELANDIC :
2533 enc = kCFStringEncodingMacIcelandic ;
2534 break ;
2535 case wxFONTENCODING_MACROMANIAN :
2536 enc = kCFStringEncodingMacRomanian ;
2537 break ;
2538 case wxFONTENCODING_MACCELTIC :
2539 enc = kCFStringEncodingMacCeltic ;
2540 break ;
2541 case wxFONTENCODING_MACGAELIC :
2542 enc = kCFStringEncodingMacGaelic ;
2543 break ;
2544 // case wxFONTENCODING_MACKEYBOARD :
2545 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2546 // break ;
2547
2548 default :
2549 // because gcc is picky
2550 break ;
2551 }
2552
2553 return enc ;
2554 }
2555
2556 class wxMBConv_cocoa : public wxMBConv
2557 {
2558 public:
2559 wxMBConv_cocoa()
2560 {
2561 Init(CFStringGetSystemEncoding()) ;
2562 }
2563
2564 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2565 {
2566 m_encoding = conv.m_encoding;
2567 }
2568
2569 #if wxUSE_FONTMAP
2570 wxMBConv_cocoa(const wxChar* name)
2571 {
2572 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2573 }
2574 #endif
2575
2576 wxMBConv_cocoa(wxFontEncoding encoding)
2577 {
2578 Init( wxCFStringEncFromFontEnc(encoding) );
2579 }
2580
2581 ~wxMBConv_cocoa()
2582 {
2583 }
2584
2585 void Init( CFStringEncoding encoding)
2586 {
2587 m_encoding = encoding ;
2588 }
2589
2590 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2591 {
2592 wxASSERT(szUnConv);
2593
2594 CFStringRef theString = CFStringCreateWithBytes (
2595 NULL, //the allocator
2596 (const UInt8*)szUnConv,
2597 strlen(szUnConv),
2598 m_encoding,
2599 false //no BOM/external representation
2600 );
2601
2602 wxASSERT(theString);
2603
2604 size_t nOutLength = CFStringGetLength(theString);
2605
2606 if (szOut == NULL)
2607 {
2608 CFRelease(theString);
2609 return nOutLength;
2610 }
2611
2612 CFRange theRange = { 0, nOutSize };
2613
2614 #if SIZEOF_WCHAR_T == 4
2615 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2616 #endif
2617
2618 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2619
2620 CFRelease(theString);
2621
2622 szUniCharBuffer[nOutLength] = '\0';
2623
2624 #if SIZEOF_WCHAR_T == 4
2625 wxMBConvUTF16 converter;
2626 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2627 delete [] szUniCharBuffer;
2628 #endif
2629
2630 return nOutLength;
2631 }
2632
2633 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2634 {
2635 wxASSERT(szUnConv);
2636
2637 size_t nRealOutSize;
2638 size_t nBufSize = wxWcslen(szUnConv);
2639 UniChar* szUniBuffer = (UniChar*) szUnConv;
2640
2641 #if SIZEOF_WCHAR_T == 4
2642 wxMBConvUTF16 converter ;
2643 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2644 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2645 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2646 nBufSize /= sizeof(UniChar);
2647 #endif
2648
2649 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2650 NULL, //allocator
2651 szUniBuffer,
2652 nBufSize,
2653 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2654 );
2655
2656 wxASSERT(theString);
2657
2658 //Note that CER puts a BOM when converting to unicode
2659 //so we check and use getchars instead in that case
2660 if (m_encoding == kCFStringEncodingUnicode)
2661 {
2662 if (szOut != NULL)
2663 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2664
2665 nRealOutSize = CFStringGetLength(theString) + 1;
2666 }
2667 else
2668 {
2669 CFStringGetBytes(
2670 theString,
2671 CFRangeMake(0, CFStringGetLength(theString)),
2672 m_encoding,
2673 0, //what to put in characters that can't be converted -
2674 //0 tells CFString to return NULL if it meets such a character
2675 false, //not an external representation
2676 (UInt8*) szOut,
2677 nOutSize,
2678 (CFIndex*) &nRealOutSize
2679 );
2680 }
2681
2682 CFRelease(theString);
2683
2684 #if SIZEOF_WCHAR_T == 4
2685 delete[] szUniBuffer;
2686 #endif
2687
2688 return nRealOutSize - 1;
2689 }
2690
2691 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2692
2693 bool IsOk() const
2694 {
2695 return m_encoding != kCFStringEncodingInvalidId &&
2696 CFStringIsEncodingAvailable(m_encoding);
2697 }
2698
2699 private:
2700 CFStringEncoding m_encoding ;
2701 };
2702
2703 #endif // defined(__WXCOCOA__)
2704
2705 // ============================================================================
2706 // Mac conversion classes
2707 // ============================================================================
2708
2709 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2710
2711 class wxMBConv_mac : public wxMBConv
2712 {
2713 public:
2714 wxMBConv_mac()
2715 {
2716 Init(CFStringGetSystemEncoding()) ;
2717 }
2718
2719 wxMBConv_mac(const wxMBConv_mac& conv)
2720 {
2721 Init(conv.m_char_encoding);
2722 }
2723
2724 #if wxUSE_FONTMAP
2725 wxMBConv_mac(const wxChar* name)
2726 {
2727 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2728 }
2729 #endif
2730
2731 wxMBConv_mac(wxFontEncoding encoding)
2732 {
2733 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2734 }
2735
2736 ~wxMBConv_mac()
2737 {
2738 OSStatus status = noErr ;
2739 status = TECDisposeConverter(m_MB2WC_converter);
2740 status = TECDisposeConverter(m_WC2MB_converter);
2741 }
2742
2743
2744 void Init( TextEncodingBase encoding)
2745 {
2746 OSStatus status = noErr ;
2747 m_char_encoding = encoding ;
2748 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2749
2750 status = TECCreateConverter(&m_MB2WC_converter,
2751 m_char_encoding,
2752 m_unicode_encoding);
2753 status = TECCreateConverter(&m_WC2MB_converter,
2754 m_unicode_encoding,
2755 m_char_encoding);
2756 }
2757
2758 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2759 {
2760 OSStatus status = noErr ;
2761 ByteCount byteOutLen ;
2762 ByteCount byteInLen = strlen(psz) ;
2763 wchar_t *tbuf = NULL ;
2764 UniChar* ubuf = NULL ;
2765 size_t res = 0 ;
2766
2767 if (buf == NULL)
2768 {
2769 // Apple specs say at least 32
2770 n = wxMax( 32, byteInLen ) ;
2771 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2772 }
2773
2774 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2775
2776 #if SIZEOF_WCHAR_T == 4
2777 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2778 #else
2779 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2780 #endif
2781
2782 status = TECConvertText(
2783 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2784 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2785
2786 #if SIZEOF_WCHAR_T == 4
2787 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2788 // is not properly terminated we get random characters at the end
2789 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2790 wxMBConvUTF16 converter ;
2791 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2792 free( ubuf ) ;
2793 #else
2794 res = byteOutLen / sizeof( UniChar ) ;
2795 #endif
2796
2797 if ( buf == NULL )
2798 free(tbuf) ;
2799
2800 if ( buf && res < n)
2801 buf[res] = 0;
2802
2803 return res ;
2804 }
2805
2806 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2807 {
2808 OSStatus status = noErr ;
2809 ByteCount byteOutLen ;
2810 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2811
2812 char *tbuf = NULL ;
2813
2814 if (buf == NULL)
2815 {
2816 // Apple specs say at least 32
2817 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2818 tbuf = (char*) malloc( n ) ;
2819 }
2820
2821 ByteCount byteBufferLen = n ;
2822 UniChar* ubuf = NULL ;
2823
2824 #if SIZEOF_WCHAR_T == 4
2825 wxMBConvUTF16 converter ;
2826 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2827 byteInLen = unicharlen ;
2828 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2829 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2830 #else
2831 ubuf = (UniChar*) psz ;
2832 #endif
2833
2834 status = TECConvertText(
2835 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2836 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2837
2838 #if SIZEOF_WCHAR_T == 4
2839 free( ubuf ) ;
2840 #endif
2841
2842 if ( buf == NULL )
2843 free(tbuf) ;
2844
2845 size_t res = byteOutLen ;
2846 if ( buf && res < n)
2847 {
2848 buf[res] = 0;
2849
2850 //we need to double-trip to verify it didn't insert any ? in place
2851 //of bogus characters
2852 wxWCharBuffer wcBuf(n);
2853 size_t pszlen = wxWcslen(psz);
2854 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2855 wxWcslen(wcBuf) != pszlen ||
2856 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2857 {
2858 // we didn't obtain the same thing we started from, hence
2859 // the conversion was lossy and we consider that it failed
2860 return wxCONV_FAILED;
2861 }
2862 }
2863
2864 return res ;
2865 }
2866
2867 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2868
2869 bool IsOk() const
2870 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2871
2872 private:
2873 TECObjectRef m_MB2WC_converter;
2874 TECObjectRef m_WC2MB_converter;
2875
2876 TextEncodingBase m_char_encoding;
2877 TextEncodingBase m_unicode_encoding;
2878 };
2879
2880 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2881
2882 // ============================================================================
2883 // wxEncodingConverter based conversion classes
2884 // ============================================================================
2885
2886 #if wxUSE_FONTMAP
2887
2888 class wxMBConv_wxwin : public wxMBConv
2889 {
2890 private:
2891 void Init()
2892 {
2893 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2894 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2895 }
2896
2897 public:
2898 // temporarily just use wxEncodingConverter stuff,
2899 // so that it works while a better implementation is built
2900 wxMBConv_wxwin(const wxChar* name)
2901 {
2902 if (name)
2903 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2904 else
2905 m_enc = wxFONTENCODING_SYSTEM;
2906
2907 Init();
2908 }
2909
2910 wxMBConv_wxwin(wxFontEncoding enc)
2911 {
2912 m_enc = enc;
2913
2914 Init();
2915 }
2916
2917 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2918 {
2919 size_t inbuf = strlen(psz);
2920 if (buf)
2921 {
2922 if (!m2w.Convert(psz, buf))
2923 return wxCONV_FAILED;
2924 }
2925 return inbuf;
2926 }
2927
2928 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2929 {
2930 const size_t inbuf = wxWcslen(psz);
2931 if (buf)
2932 {
2933 if (!w2m.Convert(psz, buf))
2934 return wxCONV_FAILED;
2935 }
2936
2937 return inbuf;
2938 }
2939
2940 virtual size_t GetMBNulLen() const
2941 {
2942 switch ( m_enc )
2943 {
2944 case wxFONTENCODING_UTF16BE:
2945 case wxFONTENCODING_UTF16LE:
2946 return 2;
2947
2948 case wxFONTENCODING_UTF32BE:
2949 case wxFONTENCODING_UTF32LE:
2950 return 4;
2951
2952 default:
2953 return 1;
2954 }
2955 }
2956
2957 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2958
2959 bool IsOk() const { return m_ok; }
2960
2961 public:
2962 wxFontEncoding m_enc;
2963 wxEncodingConverter m2w, w2m;
2964
2965 private:
2966 // were we initialized successfully?
2967 bool m_ok;
2968
2969 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2970 };
2971
2972 // make the constructors available for unit testing
2973 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2974 {
2975 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2976 if ( !result->IsOk() )
2977 {
2978 delete result;
2979 return 0;
2980 }
2981
2982 return result;
2983 }
2984
2985 #endif // wxUSE_FONTMAP
2986
2987 // ============================================================================
2988 // wxCSConv implementation
2989 // ============================================================================
2990
2991 void wxCSConv::Init()
2992 {
2993 m_name = NULL;
2994 m_convReal = NULL;
2995 m_deferred = true;
2996 }
2997
2998 wxCSConv::wxCSConv(const wxChar *charset)
2999 {
3000 Init();
3001
3002 if ( charset )
3003 {
3004 SetName(charset);
3005 }
3006
3007 #if wxUSE_FONTMAP
3008 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3009 #else
3010 m_encoding = wxFONTENCODING_SYSTEM;
3011 #endif
3012 }
3013
3014 wxCSConv::wxCSConv(wxFontEncoding encoding)
3015 {
3016 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3017 {
3018 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3019
3020 encoding = wxFONTENCODING_SYSTEM;
3021 }
3022
3023 Init();
3024
3025 m_encoding = encoding;
3026 }
3027
3028 wxCSConv::~wxCSConv()
3029 {
3030 Clear();
3031 }
3032
3033 wxCSConv::wxCSConv(const wxCSConv& conv)
3034 : wxMBConv()
3035 {
3036 Init();
3037
3038 SetName(conv.m_name);
3039 m_encoding = conv.m_encoding;
3040 }
3041
3042 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3043 {
3044 Clear();
3045
3046 SetName(conv.m_name);
3047 m_encoding = conv.m_encoding;
3048
3049 return *this;
3050 }
3051
3052 void wxCSConv::Clear()
3053 {
3054 free(m_name);
3055 delete m_convReal;
3056
3057 m_name = NULL;
3058 m_convReal = NULL;
3059 }
3060
3061 void wxCSConv::SetName(const wxChar *charset)
3062 {
3063 if (charset)
3064 {
3065 m_name = wxStrdup(charset);
3066 m_deferred = true;
3067 }
3068 }
3069
#if wxUSE_FONTMAP

#include "wx/hashmap.h"

// hash map type associating a charset name string with a font encoding
WX_DECLARE_HASH_MAP(
    wxFontEncoding,
    wxString,
    wxIntegerHash,
    wxIntegerEqual,
    wxEncodingNameCache
);

// cache of the names used by wxCSConv::DoCreate() for each encoding
static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
3078
3079 wxMBConv *wxCSConv::DoCreate() const
3080 {
3081 #if wxUSE_FONTMAP
3082 wxLogTrace(TRACE_STRCONV,
3083 wxT("creating conversion for %s"),
3084 (m_name ? m_name
3085 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3086 #endif // wxUSE_FONTMAP
3087
3088 // check for the special case of ASCII or ISO8859-1 charset: as we have
3089 // special knowledge of it anyhow, we don't need to create a special
3090 // conversion object
3091 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3092 m_encoding == wxFONTENCODING_DEFAULT )
3093 {
3094 // don't convert at all
3095 return NULL;
3096 }
3097
3098 // we trust OS to do conversion better than we can so try external
3099 // conversion methods first
3100 //
3101 // the full order is:
3102 // 1. OS conversion (iconv() under Unix or Win32 API)
3103 // 2. hard coded conversions for UTF
3104 // 3. wxEncodingConverter as fall back
3105
3106 // step (1)
3107 #ifdef HAVE_ICONV
3108 #if !wxUSE_FONTMAP
3109 if ( m_name )
3110 #endif // !wxUSE_FONTMAP
3111 {
3112 wxString name(m_name);
3113 wxFontEncoding encoding(m_encoding);
3114
3115 if ( !name.empty() )
3116 {
3117 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3118 if ( conv->IsOk() )
3119 return conv;
3120
3121 delete conv;
3122
3123 #if wxUSE_FONTMAP
3124 encoding =
3125 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3126 #endif // wxUSE_FONTMAP
3127 }
3128 #if wxUSE_FONTMAP
3129 {
3130 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3131 if ( it != gs_nameCache.end() )
3132 {
3133 if ( it->second.empty() )
3134 return NULL;
3135
3136 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3137 if ( conv->IsOk() )
3138 return conv;
3139
3140 delete conv;
3141 }
3142
3143 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3144
3145 for ( ; *names; ++names )
3146 {
3147 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3148 if ( conv->IsOk() )
3149 {
3150 gs_nameCache[encoding] = *names;
3151 return conv;
3152 }
3153
3154 delete conv;
3155 }
3156
3157 gs_nameCache[encoding] = _T(""); // cache the failure
3158 }
3159 #endif // wxUSE_FONTMAP
3160 }
3161 #endif // HAVE_ICONV
3162
3163 #ifdef wxHAVE_WIN32_MB2WC
3164 {
3165 #if wxUSE_FONTMAP
3166 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3167 : new wxMBConv_win32(m_encoding);
3168 if ( conv->IsOk() )
3169 return conv;
3170
3171 delete conv;
3172 #else
3173 return NULL;
3174 #endif
3175 }
3176 #endif // wxHAVE_WIN32_MB2WC
3177
3178 #if defined(__WXMAC__)
3179 {
3180 // leave UTF16 and UTF32 to the built-ins of wx
3181 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3182 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3183 {
3184 #if wxUSE_FONTMAP
3185 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3186 : new wxMBConv_mac(m_encoding);
3187 #else
3188 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3189 #endif
3190 if ( conv->IsOk() )
3191 return conv;
3192
3193 delete conv;
3194 }
3195 }
3196 #endif
3197
3198 #if defined(__WXCOCOA__)
3199 {
3200 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3201 {
3202 #if wxUSE_FONTMAP
3203 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3204 : new wxMBConv_cocoa(m_encoding);
3205 #else
3206 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3207 #endif
3208
3209 if ( conv->IsOk() )
3210 return conv;
3211
3212 delete conv;
3213 }
3214 }
3215 #endif
3216 // step (2)
3217 wxFontEncoding enc = m_encoding;
3218 #if wxUSE_FONTMAP
3219 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3220 {
3221 // use "false" to suppress interactive dialogs -- we can be called from
3222 // anywhere and popping up a dialog from here is the last thing we want to
3223 // do
3224 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3225 }
3226 #endif // wxUSE_FONTMAP
3227
3228 switch ( enc )
3229 {
3230 case wxFONTENCODING_UTF7:
3231 return new wxMBConvUTF7;
3232
3233 case wxFONTENCODING_UTF8:
3234 return new wxMBConvUTF8;
3235
3236 case wxFONTENCODING_UTF16BE:
3237 return new wxMBConvUTF16BE;
3238
3239 case wxFONTENCODING_UTF16LE:
3240 return new wxMBConvUTF16LE;
3241
3242 case wxFONTENCODING_UTF32BE:
3243 return new wxMBConvUTF32BE;
3244
3245 case wxFONTENCODING_UTF32LE:
3246 return new wxMBConvUTF32LE;
3247
3248 default:
3249 // nothing to do but put here to suppress gcc warnings
3250 break;
3251 }
3252
3253 // step (3)
3254 #if wxUSE_FONTMAP
3255 {
3256 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3257 : new wxMBConv_wxwin(m_encoding);
3258 if ( conv->IsOk() )
3259 return conv;
3260
3261 delete conv;
3262 }
3263 #endif // wxUSE_FONTMAP
3264
3265 // NB: This is a hack to prevent deadlock. What could otherwise happen
3266 // in Unicode build: wxConvLocal creation ends up being here
3267 // because of some failure and logs the error. But wxLog will try to
3268 // attach timestamp, for which it will need wxConvLocal (to convert
3269 // time to char* and then wchar_t*), but that fails, tries to log
3270 // error, but wxLog has a (already locked) critical section that
3271 // guards static buffer.
3272 static bool alreadyLoggingError = false;
3273 if (!alreadyLoggingError)
3274 {
3275 alreadyLoggingError = true;
3276 wxLogError(_("Cannot convert from the charset '%s'!"),
3277 m_name ? m_name
3278 :
3279 #if wxUSE_FONTMAP
3280 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3281 #else // !wxUSE_FONTMAP
3282 wxString::Format(_("encoding %s"), m_encoding).c_str()
3283 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3284 );
3285
3286 alreadyLoggingError = false;
3287 }
3288
3289 return NULL;
3290 }
3291
3292 void wxCSConv::CreateConvIfNeeded() const
3293 {
3294 if ( m_deferred )
3295 {
3296 wxCSConv *self = (wxCSConv *)this; // const_cast
3297
3298 #if wxUSE_INTL
3299 // if we don't have neither the name nor the encoding, use the default
3300 // encoding for this system
3301 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3302 {
3303 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3304 }
3305 #endif // wxUSE_INTL
3306
3307 self->m_convReal = DoCreate();
3308 self->m_deferred = false;
3309 }
3310 }
3311
3312 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3313 {
3314 CreateConvIfNeeded();
3315
3316 if (m_convReal)
3317 return m_convReal->MB2WC(buf, psz, n);
3318
3319 // latin-1 (direct)
3320 size_t len = strlen(psz);
3321
3322 if (buf)
3323 {
3324 for (size_t c = 0; c <= len; c++)
3325 buf[c] = (unsigned char)(psz[c]);
3326 }
3327
3328 return len;
3329 }
3330
3331 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3332 {
3333 CreateConvIfNeeded();
3334
3335 if (m_convReal)
3336 return m_convReal->WC2MB(buf, psz, n);
3337
3338 // latin-1 (direct)
3339 const size_t len = wxWcslen(psz);
3340 if (buf)
3341 {
3342 for (size_t c = 0; c <= len; c++)
3343 {
3344 if (psz[c] > 0xFF)
3345 return wxCONV_FAILED;
3346
3347 buf[c] = (char)psz[c];
3348 }
3349 }
3350 else
3351 {
3352 for (size_t c = 0; c <= len; c++)
3353 {
3354 if (psz[c] > 0xFF)
3355 return wxCONV_FAILED;
3356 }
3357 }
3358
3359 return len;
3360 }
3361
3362 size_t wxCSConv::GetMBNulLen() const
3363 {
3364 CreateConvIfNeeded();
3365
3366 if ( m_convReal )
3367 {
3368 return m_convReal->GetMBNulLen();
3369 }
3370
3371 return 1;
3372 }
3373
3374 // ----------------------------------------------------------------------------
3375 // globals
3376 // ----------------------------------------------------------------------------
3377
3378 #ifdef __WINDOWS__
3379 static wxMBConv_win32 wxConvLibcObj;
3380 #elif defined(__WXMAC__) && !defined(__MACH__)
3381 static wxMBConv_mac wxConvLibcObj ;
3382 #else
3383 static wxMBConvLibc wxConvLibcObj;
3384 #endif
3385
3386 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3387 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3388 static wxMBConvUTF7 wxConvUTF7Obj;
3389 static wxMBConvUTF8 wxConvUTF8Obj;
3390
3391 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3392 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3393 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3394 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3395 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3396 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3397 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3398 #ifdef __WXOSX__
3399 wxConvUTF8Obj;
3400 #else
3401 wxConvLibcObj;
3402 #endif
3403
3404
3405 #else // !wxUSE_WCHAR_T
3406
3407 // stand-ins in absence of wchar_t
3408 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3409 wxConvISO8859_1,
3410 wxConvLocal,
3411 wxConvUTF8;
3412
3413 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T