implemented wxMBConv::IsUTF8() helper for more classes so that all uses of UTF-8...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef __SALFORDC__
48 #include <clib.h>
49 #endif
50
51 #ifdef HAVE_ICONV
52 #include <iconv.h>
53 #include "wx/thread.h"
54 #endif
55
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
58
59 #ifdef __WXMAC__
60 #ifndef __DARWIN__
61 #include <ATSUnicode.h>
62 #include <TextCommon.h>
63 #include <TextEncodingConverter.h>
64 #endif
65
66 // includes Mac headers
67 #include "wx/mac/private.h"
68 #endif
69
70
71 #define TRACE_STRCONV _T("strconv")
72
73 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
74 // be 4 bytes
75 #if SIZEOF_WCHAR_T == 2
76 #define WC_UTF16
77 #endif
78
79
80 // ============================================================================
81 // implementation
82 // ============================================================================
83
// Returns true if at least one of the n bytes starting at p is not NUL and
// false if they are all NUL (in particular, false if n is 0).
//
// Used to check whether a buffer ends with a terminator of the given size.
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
92
93 // ----------------------------------------------------------------------------
94 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
95 // ----------------------------------------------------------------------------
96
97 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
98 {
99 if (input <= 0xffff)
100 {
101 if (output)
102 *output = (wxUint16) input;
103
104 return 1;
105 }
106 else if (input >= 0x110000)
107 {
108 return wxCONV_FAILED;
109 }
110 else
111 {
112 if (output)
113 {
114 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
115 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
116 }
117
118 return 2;
119 }
120 }
121
122 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
123 {
124 if ((*input < 0xd800) || (*input > 0xdfff))
125 {
126 output = *input;
127 return 1;
128 }
129 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
130 {
131 output = *input;
132 return wxCONV_FAILED;
133 }
134 else
135 {
136 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
137 return 2;
138 }
139 }
140
141 #ifdef WC_UTF16
142 typedef wchar_t wxDecodeSurrogate_t;
143 #else // !WC_UTF16
144 typedef wxUint16 wxDecodeSurrogate_t;
145 #endif // WC_UTF16/!WC_UTF16
146
147 // returns the next UTF-32 character from the wchar_t buffer and advances the
148 // pointer to the character after this one
149 //
150 // if an invalid character is found, *pSrc is set to NULL, the caller must
151 // check for this
152 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
153 {
154 wxUint32 out;
155 const size_t
156 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
157 if ( n == wxCONV_FAILED )
158 *pSrc = NULL;
159 else
160 *pSrc += n;
161
162 return out;
163 }
164
165 // ----------------------------------------------------------------------------
166 // wxMBConv
167 // ----------------------------------------------------------------------------
168
// Default implementation of ToWChar() in terms of the legacy MB2WC().
//
// Parameters:
//  dst     output buffer or NULL to only compute the required size
//  dstLen  size of dst in wide characters (compared with the running total)
//  src     multibyte input
//  srcLen  length of src in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters written (or which would be written if
// dst is NULL), counting one terminating L'\0' per converted chunk, or
// wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or if dst is too small.
//
// When srcLen is given, the input may contain several NUL-separated chunks
// which are converted one after another; with wxNO_LEN only the first chunk
// is converted.
//
// NOTE(review): when a chunk is empty the loop exits after counting its
// terminator but before writing anything to dst -- confirm that callers
// don't rely on that final L'\0' being stored.
169 size_t
170 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
171 const char *src, size_t srcLen) const
172 {
173 // although new conversion classes are supposed to implement this function
174 // directly, the existing ones only implement the old MB2WC() and so, to
175 // avoid to have to rewrite all conversion classes at once, we provide a
176 // default (but not efficient) implementation of this one in terms of the
177 // old function by copying the input to ensure that it's NUL-terminated and
178 // then using MB2WC() to convert it
179
180 // the number of chars [which would be] written to dst [if it were not NULL]
181 size_t dstWritten = 0;
182
183 // the number of NULs terminating this string
184 size_t nulLen = 0; // not really needed, but just to avoid warnings
185
186 // if we were not given the input size we just have to assume that the
187 // string is properly terminated as we have no way of knowing how long it
188 // is anyhow, but if we do have the size check whether there are enough
189 // NULs at the end
190 wxCharBuffer bufTmp;
191 const char *srcEnd;
192 if ( srcLen != wxNO_LEN )
193 {
194 // we need to know how to find the end of this string
195 nulLen = GetMBNulLen();
196 if ( nulLen == wxCONV_FAILED )
197 return wxCONV_FAILED;
198
199 // if there are enough NULs we can avoid the copy
200 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
201 {
202 // make a copy in order to properly NUL-terminate the string
203 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
204 char * const p = bufTmp.data();
205 memcpy(p, src, srcLen);
// append nulLen NUL bytes after the copied data
206 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
207 *s = '\0';
208
// from now on read from the terminated copy, which bufTmp keeps alive
209 src = bufTmp;
210 }
211
212 srcEnd = src + srcLen;
213 }
214 else // quit after the first loop iteration
215 {
// NULL srcEnd is the marker tested in the loop below
216 srcEnd = NULL;
217 }
218
219 for ( ;; )
220 {
221 // try to convert the current chunk
222 size_t lenChunk = MB2WC(NULL, src, 0);
223 if ( lenChunk == wxCONV_FAILED )
224 return wxCONV_FAILED;
225
226 lenChunk++; // for the L'\0' at the end of this chunk
227
228 dstWritten += lenChunk;
229
// lenChunk is 1 when MB2WC() reported 0 characters, i.e. an empty chunk
230 if ( lenChunk == 1 )
231 {
232 // nothing left in the input string, conversion succeeded
233 break;
234 }
235
236 if ( dst )
237 {
238 if ( dstWritten > dstLen )
239 return wxCONV_FAILED;
240
241 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
242 return wxCONV_FAILED;
243
244 dst += lenChunk;
245 }
246
247 if ( !srcEnd )
248 {
249 // we convert just one chunk in this case as this is the entire
250 // string anyhow
251 break;
252 }
253
254 // advance the input pointer past the end of this chunk
255 while ( NotAllNULs(src, nulLen) )
256 {
257 // notice that we must skip over multiple bytes here as we suppose
258 // that if NUL takes 2 or 4 bytes, then all the other characters do
259 // too and so if advanced by a single byte we might erroneously
260 // detect sequences of NUL bytes in the middle of the input
261 src += nulLen;
262 }
263
264 src += nulLen; // skipping over its terminator as well
265
266 // note that ">=" (and not just "==") is needed here as the terminator
267 // we skipped just above could be inside or just after the buffer
268 // delimited by srcEnd
269 if ( src >= srcEnd )
270 break;
271 }
272
273 return dstWritten;
274 }
275
276 size_t
277 wxMBConv::FromWChar(char *dst, size_t dstLen,
278 const wchar_t *src, size_t srcLen) const
279 {
280 // the number of chars [which would be] written to dst [if it were not NULL]
281 size_t dstWritten = 0;
282
283 // make a copy of the input string unless it is already properly
284 // NUL-terminated
285 //
286 // if we don't know its length we have no choice but to assume that it is,
287 // indeed, properly terminated
288 wxWCharBuffer bufTmp;
289 if ( srcLen == wxNO_LEN )
290 {
291 srcLen = wxWcslen(src) + 1;
292 }
293 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
294 {
295 // make a copy in order to properly NUL-terminate the string
296 bufTmp = wxWCharBuffer(srcLen);
297 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
298 src = bufTmp;
299 }
300
301 const size_t lenNul = GetMBNulLen();
302 for ( const wchar_t * const srcEnd = src + srcLen;
303 src < srcEnd;
304 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
305 {
306 // try to convert the current chunk
307 size_t lenChunk = WC2MB(NULL, src, 0);
308
309 if ( lenChunk == wxCONV_FAILED )
310 return wxCONV_FAILED;
311
312 lenChunk += lenNul;
313 dstWritten += lenChunk;
314
315 if ( dst )
316 {
317 if ( dstWritten > dstLen )
318 return wxCONV_FAILED;
319
320 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
321 return wxCONV_FAILED;
322
323 dst += lenChunk;
324 }
325 }
326
327 return dstWritten;
328 }
329
330 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
331 {
332 size_t rc = ToWChar(outBuff, outLen, inBuff);
333 if ( rc != wxCONV_FAILED )
334 {
335 // ToWChar() returns the buffer length, i.e. including the trailing
336 // NUL, while this method doesn't take it into account
337 rc--;
338 }
339
340 return rc;
341 }
342
343 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
344 {
345 size_t rc = FromWChar(outBuff, outLen, inBuff);
346 if ( rc != wxCONV_FAILED )
347 {
348 rc -= GetMBNulLen();
349 }
350
351 return rc;
352 }
353
354 wxMBConv::~wxMBConv()
355 {
356 // nothing to do here (necessary for Darwin linking probably)
357 }
358
359 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
360 {
361 if ( psz )
362 {
363 // calculate the length of the buffer needed first
364 const size_t nLen = MB2WC(NULL, psz, 0);
365 if ( nLen != wxCONV_FAILED )
366 {
367 // now do the actual conversion
368 wxWCharBuffer buf(nLen /* +1 added implicitly */);
369
370 // +1 for the trailing NULL
371 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
372 return buf;
373 }
374 }
375
376 return wxWCharBuffer();
377 }
378
379 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
380 {
381 if ( pwz )
382 {
383 const size_t nLen = WC2MB(NULL, pwz, 0);
384 if ( nLen != wxCONV_FAILED )
385 {
386 // extra space for trailing NUL(s)
387 static const size_t extraLen = GetMaxMBNulLen();
388
389 wxCharBuffer buf(nLen + extraLen - 1);
390 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
391 return buf;
392 }
393 }
394
395 return wxCharBuffer();
396 }
397
398 const wxWCharBuffer
399 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
400 {
401 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
402 if ( dstLen != wxCONV_FAILED )
403 {
404 wxWCharBuffer wbuf(dstLen - 1);
405 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
406 {
407 if ( outLen )
408 {
409 *outLen = dstLen;
410 if ( wbuf[dstLen - 1] == L'\0' )
411 (*outLen)--;
412 }
413
414 return wbuf;
415 }
416 }
417
418 if ( outLen )
419 *outLen = 0;
420
421 return wxWCharBuffer();
422 }
423
424 const wxCharBuffer
425 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
426 {
427 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
428 if ( dstLen != wxCONV_FAILED )
429 {
430 // special case of empty input: can't allocate 0 size buffer below as
431 // wxCharBuffer insists on NUL-terminating it
432 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
433 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
434 {
435 if ( outLen )
436 {
437 *outLen = dstLen;
438
439 const size_t nulLen = GetMBNulLen();
440 if ( dstLen >= nulLen &&
441 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
442 {
443 // in this case the output is NUL-terminated and we're not
444 // supposed to count NUL
445 *outLen -= nulLen;
446 }
447 }
448
449 return buf;
450 }
451 }
452
453 if ( outLen )
454 *outLen = 0;
455
456 return wxCharBuffer();
457 }
458
459 // ----------------------------------------------------------------------------
460 // wxMBConvLibc
461 // ----------------------------------------------------------------------------
462
463 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
464 {
465 return wxMB2WC(buf, psz, n);
466 }
467
468 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
469 {
470 return wxWC2MB(buf, psz, n);
471 }
472
473 // ----------------------------------------------------------------------------
474 // wxConvBrokenFileNames
475 // ----------------------------------------------------------------------------
476
477 #ifdef __UNIX__
478
479 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
480 {
481 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
482 || wxStricmp(charset, _T("UTF8")) == 0 )
483 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
484 else
485 m_conv = new wxCSConv(charset);
486 }
487
488 #endif // __UNIX__
489
490 // ----------------------------------------------------------------------------
491 // UTF-7
492 // ----------------------------------------------------------------------------
493
494 // Implementation (C) 2004 Fredrik Roubert
495
//
// BASE64 decoding table: maps a byte to its 6 bit value or to 0xff if the
// byte is not a BASE64 character (16 entries per row, the comment at the end
// of each row gives the index of its first entry)
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x00
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x10
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // 0x20
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x30
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 0x40
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x50
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 0x60
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x70
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x80
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x90
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xa0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xb0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xc0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xd0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xe0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  // 0xf0
};
534
// Decodes the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL the decoded characters are stored in it and the outer
// loop stops once n of them have been produced; if buf is NULL the
// characters are only counted.
//
// Returns the number of wide characters produced, not counting the trailing
// NUL (which is only stored if there is still room for it), or wxCONV_FAILED
// if a '+' not followed by '-' doesn't yield at least one decoded octet.
//
// NOTE(review): n is only tested by the outer loop, the BASE64 loop doesn't
// check it again, so a long encoded run looks able to store more than n
// characters -- confirm that callers always pass a large enough buffer.
535 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
536 {
537 size_t len = 0;
538
539 while ( *psz && (!buf || (len < n)) )
540 {
541 unsigned char cc = *psz++;
542 if (cc != '+')
543 {
544 // plain ASCII char
545 if (buf)
546 *buf++ = cc;
547 len++;
548 }
549 else if (*psz == '-')
550 {
551 // encoded plus sign
552 if (buf)
553 *buf++ = cc;
554 len++;
555 psz++;
556 }
557 else // start of BASE64 encoded string
558 {
// d accumulates the decoded bits and l counts how many of them are not
// consumed yet; lsb tells whether the next octet is the low or the high
// byte of the current 16 bit unit; ok becomes true once an octet is decoded
559 bool lsb, ok;
560 unsigned int d, l;
// the loop ends at the first byte which is not a BASE64 character
561 for ( ok = lsb = false, d = 0, l = 0;
562 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
563 psz++ )
564 {
565 d <<= 6;
566 d += cc;
// extract all complete octets available so far, alternating high/low byte
567 for (l += 6; l >= 8; lsb = !lsb)
568 {
569 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
570 if (lsb)
571 {
// low byte: complete the unit started by the high byte below
572 if (buf)
573 *buf++ |= c;
574 len ++;
575 }
576 else
577 {
// high byte: start a new unit without advancing the output yet
578 if (buf)
579 *buf = (wchar_t)(c << 8);
580 }
581
582 ok = true;
583 }
584 }
585
586 if ( !ok )
587 {
588 // in valid UTF7 we should have valid characters after '+'
589 return wxCONV_FAILED;
590 }
591
// skip the optional '-' ending the encoded run
592 if (*psz == '-')
593 psz++;
594 }
595 }
596
597 if ( buf && (len < n) )
598 *buf = '\0';
599
600 return len;
601 }
602
//
// BASE64 encoding table: maps a 6 bit value to the character representing it
// (16 entries per row)
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
617
//
// UTF-7 encoding table giving the class of each ASCII character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// (8 entries per row, the comment at the end of each row gives the index of
// its first entry)
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, // 0x00
    3, 2, 2, 3, 3, 2, 3, 3, // 0x08
    3, 3, 3, 3, 3, 3, 3, 3, // 0x10
    3, 3, 3, 3, 3, 3, 3, 3, // 0x18
    2, 1, 1, 1, 1, 1, 1, 0, // 0x20
    0, 0, 1, 3, 0, 0, 0, 3, // 0x28
    0, 0, 0, 0, 0, 0, 0, 0, // 0x30
    0, 0, 0, 1, 1, 1, 1, 0, // 0x38
    1, 0, 0, 0, 0, 0, 0, 0, // 0x40
    0, 0, 0, 0, 0, 0, 0, 0, // 0x48
    0, 0, 0, 0, 0, 0, 0, 0, // 0x50
    0, 0, 0, 1, 3, 1, 1, 1, // 0x58
    1, 0, 0, 0, 0, 0, 0, 0, // 0x60
    0, 0, 0, 0, 0, 0, 0, 0, // 0x68
    0, 0, 0, 0, 0, 0, 0, 0, // 0x70
    0, 0, 0, 1, 1, 1, 3, 3  // 0x78
};
637
// Encodes the NUL-terminated wide string psz in UTF-7.
//
// Characters below 0x80 whose utf7encode[] class is 0 are copied as is, '+'
// is written as "+-" and all the other ones are BASE64-encoded, as big endian
// 16 bit units, between '+' and '-'.
//
// If buf is not NULL the output is stored in it and the outer loop stops
// once n bytes have been produced; if buf is NULL the bytes are only counted.
//
// Returns the number of bytes produced, not counting the trailing NUL (which
// is only stored if there is still room for it), or wxCONV_FAILED if wchar_t
// is not UTF-16 and a character above 0xffff is found.
//
// NOTE(review): n is only tested by the outer loop, so an encoded run looks
// able to store more than n bytes -- confirm that callers always pass a large
// enough buffer.
638 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
639 {
640 size_t len = 0;
641
642 while (*psz && ((!buf) || (len < n)))
643 {
644 wchar_t cc = *psz++;
645 if (cc < 0x80 && utf7encode[cc] < 1)
646 {
647 // plain ASCII char
648 if (buf)
649 *buf++ = (char)cc;
650
651 len++;
652 }
653 #ifndef WC_UTF16
654 else if (((wxUint32)cc) > 0xffff)
655 {
656 // no surrogate pair generation (yet?)
657 return wxCONV_FAILED;
658 }
659 #endif
660 else
661 {
// start of an encoded run (or of the "+-" escape for '+' itself)
662 if (buf)
663 *buf++ = '+';
664
665 len++;
666 if (cc != '+')
667 {
668 // BASE64 encode string
// d accumulates the bits to output and l counts those not written yet
669 unsigned int lsb, d, l;
670 for (d = 0, l = 0; /*nothing*/; psz++)
671 {
// feed the high byte of the character first and then its low byte
672 for (lsb = 0; lsb < 2; lsb ++)
673 {
674 d <<= 8;
675 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
676
// output all complete sextets available so far
677 for (l += 8; l >= 6; )
678 {
679 l -= 6;
680 if (buf)
681 *buf++ = utf7enb64[(d >> l) % 64];
682 len++;
683 }
684 }
685
// peek at the next character: the run ends at the end of the string or
// at a character which can be written directly
686 cc = *psz;
687 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
688 break;
689 }
690
// flush the remaining bits, if any, padded with zeros to a full sextet
691 if (l != 0)
692 {
693 if (buf)
694 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
695
696 len++;
697 }
698 }
699
// every '+' sequence is explicitly terminated by '-'
700 if (buf)
701 *buf++ = '-';
702 len++;
703 }
704 }
705
706 if (buf && (len < n))
707 *buf = 0;
708
709 return len;
710 }
711
712 // ----------------------------------------------------------------------------
713 // UTF-8
714 // ----------------------------------------------------------------------------
715
716 static wxUint32 utf8_max[]=
717 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
718
719 // boundaries of the private use area we use to (temporarily) remap invalid
720 // characters invalid in a UTF-8 encoded string
721 const wxUint32 wxUnicodePUA = 0x100000;
722 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
723
// Decodes the NUL-terminated UTF-8 string psz.
//
// If buf is not NULL the decoded characters are stored in it and the outer
// loop stops once n of them have been produced; if buf is NULL the
// characters are only counted.
//
// Invalid input (stray continuation byte, truncated or overlong sequence,
// value which can't be stored in wchar_t when it is UTF-16 and, in PUA mode,
// a character from our own PUA range) is handled according to m_options:
//  - MAP_INVALID_UTF8_TO_PUA: each offending byte b becomes wxUnicodePUA + b
//  - MAP_INVALID_UTF8_TO_OCTAL: each offending byte becomes a backslash
//    followed by 3 octal digits (and a valid backslash is doubled)
//  - otherwise the conversion fails
//
// Returns the number of wide characters produced, not counting the trailing
// NUL (which is only stored if there is still room for it), or wxCONV_FAILED.
//
// NOTE(review): when wchar_t is UTF-16 a surrogate pair is stored even if
// only one more character fits into n -- confirm that callers always pass a
// large enough buffer.
724 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
725 {
726 size_t len = 0;
727
728 while (*psz && ((!buf) || (len < n)))
729 {
// remember where this sequence starts to be able to remap its bytes if it
// turns out to be invalid
730 const char *opsz = psz;
731 bool invalid = false;
732 unsigned char cc = *psz++, fc = cc;
// count the leading 1 bits of the first byte
733 unsigned cnt;
734 for (cnt = 0; fc & 0x80; cnt++)
735 fc <<= 1;
736
737 if (!cnt)
738 {
739 // plain ASCII char
740 if (buf)
741 *buf++ = cc;
742 len++;
743
744 // escape the escape character for octal escapes
745 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
746 && cc == '\\' && (!buf || len < n))
747 {
748 if (buf)
749 *buf++ = cc;
750 len++;
751 }
752 }
753 else
754 {
// from now on cnt is the number of continuation bytes expected
755 cnt--;
756 if (!cnt)
757 {
758 // invalid UTF-8 sequence
// (a continuation byte can't start a sequence)
759 invalid = true;
760 }
761 else
762 {
// ocnt indexes the greatest value encodable with one byte less
763 unsigned ocnt = cnt - 1;
// start with the payload bits of the first byte
764 wxUint32 res = cc & (0x3f >> cnt);
765 while (cnt--)
766 {
767 cc = *psz;
768 if ((cc & 0xC0) != 0x80)
769 {
770 // invalid UTF-8 sequence
771 invalid = true;
772 break;
773 }
774
775 psz++;
776 res = (res << 6) | (cc & 0x3f);
777 }
778
// a value which would have fit in a shorter sequence is overlong
779 if (invalid || res <= utf8_max[ocnt])
780 {
781 // illegal UTF-8 encoding
782 invalid = true;
783 }
784 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
785 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
786 {
787 // if one of our PUA characters turns up externally
788 // it must also be treated as an illegal sequence
789 // (a bit like you have to escape an escape character)
790 invalid = true;
791 }
792 else
793 {
794 #ifdef WC_UTF16
795 // cast is ok because wchar_t == wxUint16 if WC_UTF16
796 size_t pa = encode_utf16(res, (wxUint16 *)buf);
797 if (pa == wxCONV_FAILED)
798 {
799 invalid = true;
800 }
801 else
802 {
803 if (buf)
804 buf += pa;
805 len += pa;
806 }
807 #else // !WC_UTF16
808 if (buf)
809 *buf++ = (wchar_t)res;
810 len++;
811 #endif // WC_UTF16/!WC_UTF16
812 }
813 }
814
815 if (invalid)
816 {
817 if (m_options & MAP_INVALID_UTF8_TO_PUA)
818 {
// remap every byte consumed for this sequence to its own PUA character
819 while (opsz < psz && (!buf || len < n))
820 {
821 #ifdef WC_UTF16
822 // cast is ok because wchar_t == wxUint16 if WC_UTF16
823 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
824 wxASSERT(pa != wxCONV_FAILED);
825 if (buf)
826 buf += pa;
827 opsz++;
828 len += pa;
829 #else
830 if (buf)
831 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
832 opsz++;
833 len++;
834 #endif
835 }
836 }
837 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
838 {
// replace every byte consumed for this sequence by a "\ooo" escape
839 while (opsz < psz && (!buf || len < n))
840 {
// the 4 characters are only stored if they all fit but len is advanced
// by 4 in any case
841 if ( buf && len + 3 < n )
842 {
843 unsigned char on = *opsz;
844 *buf++ = L'\\';
845 *buf++ = (wchar_t)( L'0' + on / 0100 );
846 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
847 *buf++ = (wchar_t)( L'0' + on % 010 );
848 }
849
850 opsz++;
851 len += 4;
852 }
853 }
854 else // MAP_INVALID_UTF8_NOT
855 {
856 return wxCONV_FAILED;
857 }
858 }
859 }
860 }
861
862 if (buf && (len < n))
863 *buf = 0;
864
865 return len;
866 }
867
// Returns true if wch is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
872
// Encodes the NUL-terminated wide string psz in UTF-8.
//
// Depending on m_options, the mappings used for invalid input when decoding
// are undone here:
//  - MAP_INVALID_UTF8_TO_PUA: a character in the range
//    [wxUnicodePUA, wxUnicodePUAEnd) is output as the single byte it stands
//    for
//  - MAP_INVALID_UTF8_TO_OCTAL: a doubled backslash is output as a single one
//    and a backslash followed by 3 octal digits as the corresponding byte
//
// If buf is not NULL the output is stored in it and the loop stops once n
// bytes have been produced; if buf is NULL the bytes are only counted.
//
// Returns the number of bytes produced, not counting the trailing NUL (which
// is only stored if there is still room for it).
//
// NOTE(review): n is only tested before each character, so a multibyte
// sequence looks able to be stored past it -- confirm that callers always
// pass a large enough buffer.
873 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
874 {
875 size_t len = 0;
876
877 while (*psz && ((!buf) || (len < n)))
878 {
879 wxUint32 cc;
880
881 #ifdef WC_UTF16
882 // cast is ok for WC_UTF16
// if the code unit can't be decoded, decode_utf16() leaves it in cc: encode
// it on its own and skip just this one unit
883 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
884 psz += (pa == wxCONV_FAILED) ? 1 : pa;
885 #else
// only the low 31 bits of the character are used
886 cc = (*psz++) & 0x7fffffff;
887 #endif
888
889 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
890 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
891 {
// restore the original byte from its PUA character
892 if (buf)
893 *buf++ = (char)(cc - wxUnicodePUA);
894 len++;
895 }
896 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
897 && cc == L'\\' && psz[0] == L'\\' )
898 {
// an escaped backslash: output one and skip the other
899 if (buf)
900 *buf++ = (char)cc;
901 psz++;
902 len++;
903 }
904 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
905 cc == L'\\' &&
906 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
907 {
// an octal escape: output the byte it stands for
908 if (buf)
909 {
910 *buf++ = (char) ((psz[0] - L'0') * 0100 +
911 (psz[1] - L'0') * 010 +
912 (psz[2] - L'0'));
913 }
914
915 psz += 3;
916 len++;
917 }
918 else
919 {
// find the number of continuation bytes needed for this character
920 unsigned cnt;
921 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
922 {
923 }
924
925 if (!cnt)
926 {
927 // plain ASCII char
928 if (buf)
929 *buf++ = (char) cc;
930 len++;
931 }
932 else
933 {
934 len += cnt + 1;
935 if (buf)
936 {
// lead byte: cnt + 1 high bits set followed by the top bits of the value
// (this relies on -128 >> cnt shifting in sign bits)
937 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
// continuation bytes: 10xxxxxx with 6 bits of the value each, highest first
938 while (cnt--)
939 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
940 }
941 }
942 }
943 }
944
945 if (buf && (len < n))
946 *buf = 0;
947
948 return len;
949 }
950
951 // ============================================================================
952 // UTF-16
953 // ============================================================================
954
955 #ifdef WORDS_BIGENDIAN
956 #define wxMBConvUTF16straight wxMBConvUTF16BE
957 #define wxMBConvUTF16swap wxMBConvUTF16LE
958 #else
959 #define wxMBConvUTF16swap wxMBConvUTF16BE
960 #define wxMBConvUTF16straight wxMBConvUTF16LE
961 #endif
962
963 /* static */
964 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
965 {
966 if ( srcLen == wxNO_LEN )
967 {
968 // count the number of bytes in input, including the trailing NULs
969 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
970 for ( srcLen = 1; *inBuff++; srcLen++ )
971 ;
972
973 srcLen *= BYTES_PER_CHAR;
974 }
975 else // we already have the length
976 {
977 // we can only convert an entire number of UTF-16 characters
978 if ( srcLen % BYTES_PER_CHAR )
979 return wxCONV_FAILED;
980 }
981
982 return srcLen;
983 }
984
985 // case when in-memory representation is UTF-16 too
986 #ifdef WC_UTF16
987
988 // ----------------------------------------------------------------------------
989 // conversions without endianness change
990 // ----------------------------------------------------------------------------
991
992 size_t
993 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
994 const char *src, size_t srcLen) const
995 {
996 // set up the scene for using memcpy() (which is presumably more efficient
997 // than copying the bytes one by one)
998 srcLen = GetLength(src, srcLen);
999 if ( srcLen == wxNO_LEN )
1000 return wxCONV_FAILED;
1001
1002 const size_t inLen = srcLen / BYTES_PER_CHAR;
1003 if ( dst )
1004 {
1005 if ( dstLen < inLen )
1006 return wxCONV_FAILED;
1007
1008 memcpy(dst, src, srcLen);
1009 }
1010
1011 return inLen;
1012 }
1013
1014 size_t
1015 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1016 const wchar_t *src, size_t srcLen) const
1017 {
1018 if ( srcLen == wxNO_LEN )
1019 srcLen = wxWcslen(src) + 1;
1020
1021 srcLen *= BYTES_PER_CHAR;
1022
1023 if ( dst )
1024 {
1025 if ( dstLen < srcLen )
1026 return wxCONV_FAILED;
1027
1028 memcpy(dst, src, srcLen);
1029 }
1030
1031 return srcLen;
1032 }
1033
1034 // ----------------------------------------------------------------------------
1035 // endian-reversing conversions
1036 // ----------------------------------------------------------------------------
1037
1038 size_t
1039 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1040 const char *src, size_t srcLen) const
1041 {
1042 srcLen = GetLength(src, srcLen);
1043 if ( srcLen == wxNO_LEN )
1044 return wxCONV_FAILED;
1045
1046 srcLen /= BYTES_PER_CHAR;
1047
1048 if ( dst )
1049 {
1050 if ( dstLen < srcLen )
1051 return wxCONV_FAILED;
1052
1053 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1054 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1055 {
1056 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1057 }
1058 }
1059
1060 return srcLen;
1061 }
1062
1063 size_t
1064 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1065 const wchar_t *src, size_t srcLen) const
1066 {
1067 if ( srcLen == wxNO_LEN )
1068 srcLen = wxWcslen(src) + 1;
1069
1070 srcLen *= BYTES_PER_CHAR;
1071
1072 if ( dst )
1073 {
1074 if ( dstLen < srcLen )
1075 return wxCONV_FAILED;
1076
1077 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1078 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1079 {
1080 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1081 }
1082 }
1083
1084 return srcLen;
1085 }
1086
1087 #else // !WC_UTF16: wchar_t is UTF-32
1088
1089 // ----------------------------------------------------------------------------
1090 // conversions without endianness change
1091 // ----------------------------------------------------------------------------
1092
1093 size_t
1094 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1095 const char *src, size_t srcLen) const
1096 {
1097 srcLen = GetLength(src, srcLen);
1098 if ( srcLen == wxNO_LEN )
1099 return wxCONV_FAILED;
1100
1101 const size_t inLen = srcLen / BYTES_PER_CHAR;
1102 if ( !dst )
1103 {
1104 // optimization: return maximal space which could be needed for this
1105 // string even if the real size could be smaller if the buffer contains
1106 // any surrogates
1107 return inLen;
1108 }
1109
1110 size_t outLen = 0;
1111 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1112 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1113 {
1114 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1115 if ( !inBuff )
1116 return wxCONV_FAILED;
1117
1118 if ( ++outLen > dstLen )
1119 return wxCONV_FAILED;
1120
1121 *dst++ = ch;
1122 }
1123
1124
1125 return outLen;
1126 }
1127
1128 size_t
1129 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1130 const wchar_t *src, size_t srcLen) const
1131 {
1132 if ( srcLen == wxNO_LEN )
1133 srcLen = wxWcslen(src) + 1;
1134
1135 size_t outLen = 0;
1136 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1137 for ( size_t n = 0; n < srcLen; n++ )
1138 {
1139 wxUint16 cc[2];
1140 const size_t numChars = encode_utf16(*src++, cc);
1141 if ( numChars == wxCONV_FAILED )
1142 return wxCONV_FAILED;
1143
1144 outLen += numChars * BYTES_PER_CHAR;
1145 if ( outBuff )
1146 {
1147 if ( outLen > dstLen )
1148 return wxCONV_FAILED;
1149
1150 *outBuff++ = cc[0];
1151 if ( numChars == 2 )
1152 {
1153 // second character of a surrogate
1154 *outBuff++ = cc[1];
1155 }
1156 }
1157 }
1158
1159 return outLen;
1160 }
1161
1162 // ----------------------------------------------------------------------------
1163 // endian-reversing conversions
1164 // ----------------------------------------------------------------------------
1165
1166 size_t
1167 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1168 const char *src, size_t srcLen) const
1169 {
1170 srcLen = GetLength(src, srcLen);
1171 if ( srcLen == wxNO_LEN )
1172 return wxCONV_FAILED;
1173
1174 const size_t inLen = srcLen / BYTES_PER_CHAR;
1175 if ( !dst )
1176 {
1177 // optimization: return maximal space which could be needed for this
1178 // string even if the real size could be smaller if the buffer contains
1179 // any surrogates
1180 return inLen;
1181 }
1182
1183 size_t outLen = 0;
1184 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1185 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1186 {
1187 wxUint32 ch;
1188 wxUint16 tmp[2];
1189
1190 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1191 inBuff++;
1192 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1193
1194 const size_t numChars = decode_utf16(tmp, ch);
1195 if ( numChars == wxCONV_FAILED )
1196 return wxCONV_FAILED;
1197
1198 if ( numChars == 2 )
1199 inBuff++;
1200
1201 if ( ++outLen > dstLen )
1202 return wxCONV_FAILED;
1203
1204 *dst++ = ch;
1205 }
1206
1207
1208 return outLen;
1209 }
1210
1211 size_t
1212 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1213 const wchar_t *src, size_t srcLen) const
1214 {
1215 if ( srcLen == wxNO_LEN )
1216 srcLen = wxWcslen(src) + 1;
1217
1218 size_t outLen = 0;
1219 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1220 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1221 {
1222 wxUint16 cc[2];
1223 const size_t numChars = encode_utf16(*src, cc);
1224 if ( numChars == wxCONV_FAILED )
1225 return wxCONV_FAILED;
1226
1227 outLen += numChars * BYTES_PER_CHAR;
1228 if ( outBuff )
1229 {
1230 if ( outLen > dstLen )
1231 return wxCONV_FAILED;
1232
1233 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1234 if ( numChars == 2 )
1235 {
1236 // second character of a surrogate
1237 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1238 }
1239 }
1240 }
1241
1242 return outLen;
1243 }
1244
1245 #endif // WC_UTF16/!WC_UTF16
1246
1247
1248 // ============================================================================
1249 // UTF-32
1250 // ============================================================================
1251
1252 #ifdef WORDS_BIGENDIAN
1253 #define wxMBConvUTF32straight wxMBConvUTF32BE
1254 #define wxMBConvUTF32swap wxMBConvUTF32LE
1255 #else
1256 #define wxMBConvUTF32swap wxMBConvUTF32BE
1257 #define wxMBConvUTF32straight wxMBConvUTF32LE
1258 #endif
1259
1260
// the global converter objects for little and big endian UTF-32
// (presumably WXDLLIMPEXP_DATA_BASE() adds the DLL export/import decoration
// to the type -- confirm against its definition)
1261 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1262 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1263
1264 /* static */
1265 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1266 {
1267 if ( srcLen == wxNO_LEN )
1268 {
1269 // count the number of bytes in input, including the trailing NULs
1270 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1271 for ( srcLen = 1; *inBuff++; srcLen++ )
1272 ;
1273
1274 srcLen *= BYTES_PER_CHAR;
1275 }
1276 else // we already have the length
1277 {
1278 // we can only convert an entire number of UTF-32 characters
1279 if ( srcLen % BYTES_PER_CHAR )
1280 return wxCONV_FAILED;
1281 }
1282
1283 return srcLen;
1284 }
1285
1286 // case when in-memory representation is UTF-16
1287 #ifdef WC_UTF16
1288
1289 // ----------------------------------------------------------------------------
1290 // conversions without endianness change
1291 // ----------------------------------------------------------------------------
1292
1293 size_t
1294 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1295 const char *src, size_t srcLen) const
1296 {
1297 srcLen = GetLength(src, srcLen);
1298 if ( srcLen == wxNO_LEN )
1299 return wxCONV_FAILED;
1300
1301 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1302 const size_t inLen = srcLen / BYTES_PER_CHAR;
1303 size_t outLen = 0;
1304 for ( size_t n = 0; n < inLen; n++ )
1305 {
1306 wxUint16 cc[2];
1307 const size_t numChars = encode_utf16(*inBuff++, cc);
1308 if ( numChars == wxCONV_FAILED )
1309 return wxCONV_FAILED;
1310
1311 outLen += numChars;
1312 if ( dst )
1313 {
1314 if ( outLen > dstLen )
1315 return wxCONV_FAILED;
1316
1317 *dst++ = cc[0];
1318 if ( numChars == 2 )
1319 {
1320 // second character of a surrogate
1321 *dst++ = cc[1];
1322 }
1323 }
1324 }
1325
1326 return outLen;
1327 }
1328
1329 size_t
1330 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1331 const wchar_t *src, size_t srcLen) const
1332 {
1333 if ( srcLen == wxNO_LEN )
1334 srcLen = wxWcslen(src) + 1;
1335
1336 if ( !dst )
1337 {
1338 // optimization: return maximal space which could be needed for this
1339 // string instead of the exact amount which could be less if there are
1340 // any surrogates in the input
1341 //
1342 // we consider that surrogates are rare enough to make it worthwhile to
1343 // avoid running the loop below at the cost of slightly extra memory
1344 // consumption
1345 return srcLen * BYTES_PER_CHAR;
1346 }
1347
1348 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1349 size_t outLen = 0;
1350 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1351 {
1352 const wxUint32 ch = wxDecodeSurrogate(&src);
1353 if ( !src )
1354 return wxCONV_FAILED;
1355
1356 outLen += BYTES_PER_CHAR;
1357
1358 if ( outLen > dstLen )
1359 return wxCONV_FAILED;
1360
1361 *outBuff++ = ch;
1362 }
1363
1364 return outLen;
1365 }
1366
1367 // ----------------------------------------------------------------------------
1368 // endian-reversing conversions
1369 // ----------------------------------------------------------------------------
1370
1371 size_t
1372 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1373 const char *src, size_t srcLen) const
1374 {
1375 srcLen = GetLength(src, srcLen);
1376 if ( srcLen == wxNO_LEN )
1377 return wxCONV_FAILED;
1378
1379 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1380 const size_t inLen = srcLen / BYTES_PER_CHAR;
1381 size_t outLen = 0;
1382 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1383 {
1384 wxUint16 cc[2];
1385 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1386 if ( numChars == wxCONV_FAILED )
1387 return wxCONV_FAILED;
1388
1389 outLen += numChars;
1390 if ( dst )
1391 {
1392 if ( outLen > dstLen )
1393 return wxCONV_FAILED;
1394
1395 *dst++ = cc[0];
1396 if ( numChars == 2 )
1397 {
1398 // second character of a surrogate
1399 *dst++ = cc[1];
1400 }
1401 }
1402 }
1403
1404 return outLen;
1405 }
1406
1407 size_t
1408 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1409 const wchar_t *src, size_t srcLen) const
1410 {
1411 if ( srcLen == wxNO_LEN )
1412 srcLen = wxWcslen(src) + 1;
1413
1414 if ( !dst )
1415 {
1416 // optimization: return maximal space which could be needed for this
1417 // string instead of the exact amount which could be less if there are
1418 // any surrogates in the input
1419 //
1420 // we consider that surrogates are rare enough to make it worthwhile to
1421 // avoid running the loop below at the cost of slightly extra memory
1422 // consumption
1423 return srcLen*BYTES_PER_CHAR;
1424 }
1425
1426 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1427 size_t outLen = 0;
1428 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1429 {
1430 const wxUint32 ch = wxDecodeSurrogate(&src);
1431 if ( !src )
1432 return wxCONV_FAILED;
1433
1434 outLen += BYTES_PER_CHAR;
1435
1436 if ( outLen > dstLen )
1437 return wxCONV_FAILED;
1438
1439 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1440 }
1441
1442 return outLen;
1443 }
1444
1445 #else // !WC_UTF16: wchar_t is UTF-32
1446
1447 // ----------------------------------------------------------------------------
1448 // conversions without endianness change
1449 // ----------------------------------------------------------------------------
1450
1451 size_t
1452 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1453 const char *src, size_t srcLen) const
1454 {
1455 // use memcpy() as it should be much faster than hand-written loop
1456 srcLen = GetLength(src, srcLen);
1457 if ( srcLen == wxNO_LEN )
1458 return wxCONV_FAILED;
1459
1460 const size_t inLen = srcLen/BYTES_PER_CHAR;
1461 if ( dst )
1462 {
1463 if ( dstLen < inLen )
1464 return wxCONV_FAILED;
1465
1466 memcpy(dst, src, srcLen);
1467 }
1468
1469 return inLen;
1470 }
1471
1472 size_t
1473 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1474 const wchar_t *src, size_t srcLen) const
1475 {
1476 if ( srcLen == wxNO_LEN )
1477 srcLen = wxWcslen(src) + 1;
1478
1479 srcLen *= BYTES_PER_CHAR;
1480
1481 if ( dst )
1482 {
1483 if ( dstLen < srcLen )
1484 return wxCONV_FAILED;
1485
1486 memcpy(dst, src, srcLen);
1487 }
1488
1489 return srcLen;
1490 }
1491
1492 // ----------------------------------------------------------------------------
1493 // endian-reversing conversions
1494 // ----------------------------------------------------------------------------
1495
1496 size_t
1497 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1498 const char *src, size_t srcLen) const
1499 {
1500 srcLen = GetLength(src, srcLen);
1501 if ( srcLen == wxNO_LEN )
1502 return wxCONV_FAILED;
1503
1504 srcLen /= BYTES_PER_CHAR;
1505
1506 if ( dst )
1507 {
1508 if ( dstLen < srcLen )
1509 return wxCONV_FAILED;
1510
1511 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1512 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1513 {
1514 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1515 }
1516 }
1517
1518 return srcLen;
1519 }
1520
1521 size_t
1522 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1523 const wchar_t *src, size_t srcLen) const
1524 {
1525 if ( srcLen == wxNO_LEN )
1526 srcLen = wxWcslen(src) + 1;
1527
1528 srcLen *= BYTES_PER_CHAR;
1529
1530 if ( dst )
1531 {
1532 if ( dstLen < srcLen )
1533 return wxCONV_FAILED;
1534
1535 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1536 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1537 {
1538 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1539 }
1540 }
1541
1542 return srcLen;
1543 }
1544
1545 #endif // WC_UTF16/!WC_UTF16
1546
1547
1548 // ============================================================================
1549 // The classes doing conversion using the iconv_xxx() functions
1550 // ============================================================================
1551
1552 #ifdef HAVE_ICONV
1553
1554 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1555 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1556 // (unless there's yet another bug in glibc) the only case when iconv()
1557 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1558 // left in the input buffer -- when _real_ error occurs,
1559 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1560 // iconv() failure.
1561 // [This bug does not appear in glibc 2.2.]
1562 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1563 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1564 (errno != E2BIG || bufLeft != 0))
1565 #else
1566 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1567 #endif
1568
1569 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1570
1571 #define ICONV_T_INVALID ((iconv_t)-1)
1572
1573 #if SIZEOF_WCHAR_T == 4
1574 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1575 #define WC_ENC wxFONTENCODING_UTF32
1576 #elif SIZEOF_WCHAR_T == 2
1577 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1578 #define WC_ENC wxFONTENCODING_UTF16
1579 #else // sizeof(wchar_t) != 2 nor 4
1580 // does this ever happen?
1581 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1582 #endif
1583
1584 // ----------------------------------------------------------------------------
1585 // wxMBConv_iconv: encapsulates an iconv character set
1586 // ----------------------------------------------------------------------------
1587
1588 class wxMBConv_iconv : public wxMBConv
1589 {
1590 public:
1591 wxMBConv_iconv(const wxChar *name);
1592 virtual ~wxMBConv_iconv();
1593
1594 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1595 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1596
1597 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1598 virtual size_t GetMBNulLen() const;
1599
1600 #if wxUSE_UNICODE_UTF8
1601 virtual bool IsUTF8() const;
1602 #endif
1603
1604 virtual wxMBConv *Clone() const
1605 {
1606 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1607 p->m_minMBCharWidth = m_minMBCharWidth;
1608 return p;
1609 }
1610
1611 bool IsOk() const
1612 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1613
1614 protected:
1615 // the iconv handlers used to translate from multibyte
1616 // to wide char and in the other direction
1617 iconv_t m2w,
1618 w2m;
1619
1620 #if wxUSE_THREADS
1621 // guards access to m2w and w2m objects
1622 wxMutex m_iconvMutex;
1623 #endif
1624
1625 private:
1626 // the name (for iconv_open()) of a wide char charset -- if none is
1627 // available on this machine, it will remain NULL
1628 static wxString ms_wcCharsetName;
1629
1630 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1631 // different endian-ness than the native one
1632 static bool ms_wcNeedsSwap;
1633
1634
1635 // name of the encoding handled by this conversion
1636 wxString m_name;
1637
1638 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1639 // initially
1640 size_t m_minMBCharWidth;
1641 };
1642
1643 // make the constructor available for unit testing
1644 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1645 {
1646 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1647 if ( !result->IsOk() )
1648 {
1649 delete result;
1650 return 0;
1651 }
1652
1653 return result;
1654 }
1655
1656 wxString wxMBConv_iconv::ms_wcCharsetName;
1657 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1658
1659 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1660 : m_name(name)
1661 {
1662 m_minMBCharWidth = 0;
1663
1664 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1665 // names for the charsets
1666 const wxCharBuffer cname(wxString(name).ToAscii());
1667
1668 // check for charset that represents wchar_t:
1669 if ( ms_wcCharsetName.empty() )
1670 {
1671 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1672
1673 #if wxUSE_FONTMAP
1674 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1675 #else // !wxUSE_FONTMAP
1676 static const wxChar *names_static[] =
1677 {
1678 #if SIZEOF_WCHAR_T == 4
1679 _T("UCS-4"),
1680 #elif SIZEOF_WCHAR_T = 2
1681 _T("UCS-2"),
1682 #endif
1683 NULL
1684 };
1685 const wxChar **names = names_static;
1686 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1687
1688 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1689 {
1690 const wxString nameCS(*names);
1691
1692 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1693 wxString nameXE(nameCS);
1694
1695 #ifdef WORDS_BIGENDIAN
1696 nameXE += _T("BE");
1697 #else // little endian
1698 nameXE += _T("LE");
1699 #endif
1700
1701 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1702 nameXE.c_str());
1703
1704 m2w = iconv_open(nameXE.ToAscii(), cname);
1705 if ( m2w == ICONV_T_INVALID )
1706 {
1707 // try charset w/o bytesex info (e.g. "UCS4")
1708 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1709 nameCS.c_str());
1710 m2w = iconv_open(nameCS.ToAscii(), cname);
1711
1712 // and check for bytesex ourselves:
1713 if ( m2w != ICONV_T_INVALID )
1714 {
1715 char buf[2], *bufPtr;
1716 wchar_t wbuf[2], *wbufPtr;
1717 size_t insz, outsz;
1718 size_t res;
1719
1720 buf[0] = 'A';
1721 buf[1] = 0;
1722 wbuf[0] = 0;
1723 insz = 2;
1724 outsz = SIZEOF_WCHAR_T * 2;
1725 wbufPtr = wbuf;
1726 bufPtr = buf;
1727
1728 res = iconv(
1729 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1730 (char**)&wbufPtr, &outsz);
1731
1732 if (ICONV_FAILED(res, insz))
1733 {
1734 wxLogLastError(wxT("iconv"));
1735 wxLogError(_("Conversion to charset '%s' doesn't work."),
1736 nameCS.c_str());
1737 }
1738 else // ok, can convert to this encoding, remember it
1739 {
1740 ms_wcCharsetName = nameCS;
1741 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1742 }
1743 }
1744 }
1745 else // use charset not requiring byte swapping
1746 {
1747 ms_wcCharsetName = nameXE;
1748 }
1749 }
1750
1751 wxLogTrace(TRACE_STRCONV,
1752 wxT("iconv wchar_t charset is \"%s\"%s"),
1753 ms_wcCharsetName.empty() ? _T("<none>")
1754 : ms_wcCharsetName.c_str(),
1755 ms_wcNeedsSwap ? _T(" (needs swap)")
1756 : _T(""));
1757 }
1758 else // we already have ms_wcCharsetName
1759 {
1760 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1761 }
1762
1763 if ( ms_wcCharsetName.empty() )
1764 {
1765 w2m = ICONV_T_INVALID;
1766 }
1767 else
1768 {
1769 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1770 if ( w2m == ICONV_T_INVALID )
1771 {
1772 wxLogTrace(TRACE_STRCONV,
1773 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1774 ms_wcCharsetName.c_str(), cname.data());
1775 }
1776 }
1777 }
1778
1779 wxMBConv_iconv::~wxMBConv_iconv()
1780 {
1781 if ( m2w != ICONV_T_INVALID )
1782 iconv_close(m2w);
1783 if ( w2m != ICONV_T_INVALID )
1784 iconv_close(w2m);
1785 }
1786
1787 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1788 {
1789 // find the string length: notice that must be done differently for
1790 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1791 size_t inbuf;
1792 const size_t nulLen = GetMBNulLen();
1793 switch ( nulLen )
1794 {
1795 default:
1796 return wxCONV_FAILED;
1797
1798 case 1:
1799 inbuf = strlen(psz); // arguably more optimized than our version
1800 break;
1801
1802 case 2:
1803 case 4:
1804 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1805 // they also have to start at character boundary and not span two
1806 // adjacent characters
1807 const char *p;
1808 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1809 ;
1810 inbuf = p - psz;
1811 break;
1812 }
1813
1814 #if wxUSE_THREADS
1815 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1816 // Unfortunately there are a couple of global wxCSConv objects such as
1817 // wxConvLocal that are used all over wx code, so we have to make sure
1818 // the handle is used by at most one thread at the time. Otherwise
1819 // only a few wx classes would be safe to use from non-main threads
1820 // as MB<->WC conversion would fail "randomly".
1821 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1822 #endif // wxUSE_THREADS
1823
1824 size_t outbuf = n * SIZEOF_WCHAR_T;
1825 size_t res, cres;
1826 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1827 wchar_t *bufPtr = buf;
1828 const char *pszPtr = psz;
1829
1830 if (buf)
1831 {
1832 // have destination buffer, convert there
1833 cres = iconv(m2w,
1834 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1835 (char**)&bufPtr, &outbuf);
1836 res = n - (outbuf / SIZEOF_WCHAR_T);
1837
1838 if (ms_wcNeedsSwap)
1839 {
1840 // convert to native endianness
1841 for ( unsigned i = 0; i < res; i++ )
1842 buf[n] = WC_BSWAP(buf[i]);
1843 }
1844
1845 // NUL-terminate the string if there is any space left
1846 if (res < n)
1847 buf[res] = 0;
1848 }
1849 else
1850 {
1851 // no destination buffer... convert using temp buffer
1852 // to calculate destination buffer requirement
1853 wchar_t tbuf[8];
1854 res = 0;
1855
1856 do
1857 {
1858 bufPtr = tbuf;
1859 outbuf = 8 * SIZEOF_WCHAR_T;
1860
1861 cres = iconv(m2w,
1862 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1863 (char**)&bufPtr, &outbuf );
1864
1865 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1866 }
1867 while ((cres == (size_t)-1) && (errno == E2BIG));
1868 }
1869
1870 if (ICONV_FAILED(cres, inbuf))
1871 {
1872 //VS: it is ok if iconv fails, hence trace only
1873 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1874 return wxCONV_FAILED;
1875 }
1876
1877 return res;
1878 }
1879
1880 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1881 {
1882 #if wxUSE_THREADS
1883 // NB: explained in MB2WC
1884 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1885 #endif
1886
1887 size_t inlen = wxWcslen(psz);
1888 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1889 size_t outbuf = n;
1890 size_t res, cres;
1891
1892 wchar_t *tmpbuf = 0;
1893
1894 if (ms_wcNeedsSwap)
1895 {
1896 // need to copy to temp buffer to switch endianness
1897 // (doing WC_BSWAP twice on the original buffer won't help, as it
1898 // could be in read-only memory, or be accessed in some other thread)
1899 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1900 for ( size_t i = 0; i < inlen; i++ )
1901 tmpbuf[n] = WC_BSWAP(psz[i]);
1902
1903 tmpbuf[inlen] = L'\0';
1904 psz = tmpbuf;
1905 }
1906
1907 if (buf)
1908 {
1909 // have destination buffer, convert there
1910 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1911
1912 res = n - outbuf;
1913
1914 // NB: iconv was given only wcslen(psz) characters on input, and so
1915 // it couldn't convert the trailing zero. Let's do it ourselves
1916 // if there's some room left for it in the output buffer.
1917 if (res < n)
1918 buf[0] = 0;
1919 }
1920 else
1921 {
1922 // no destination buffer: convert using temp buffer
1923 // to calculate destination buffer requirement
1924 char tbuf[16];
1925 res = 0;
1926 do
1927 {
1928 buf = tbuf;
1929 outbuf = 16;
1930
1931 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1932
1933 res += 16 - outbuf;
1934 }
1935 while ((cres == (size_t)-1) && (errno == E2BIG));
1936 }
1937
1938 if (ms_wcNeedsSwap)
1939 {
1940 free(tmpbuf);
1941 }
1942
1943 if (ICONV_FAILED(cres, inbuf))
1944 {
1945 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1946 return wxCONV_FAILED;
1947 }
1948
1949 return res;
1950 }
1951
1952 size_t wxMBConv_iconv::GetMBNulLen() const
1953 {
1954 if ( m_minMBCharWidth == 0 )
1955 {
1956 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1957
1958 #if wxUSE_THREADS
1959 // NB: explained in MB2WC
1960 wxMutexLocker lock(self->m_iconvMutex);
1961 #endif
1962
1963 wchar_t *wnul = L"";
1964 char buf[8]; // should be enough for NUL in any encoding
1965 size_t inLen = sizeof(wchar_t),
1966 outLen = WXSIZEOF(buf);
1967 char *inBuff = (char *)wnul;
1968 char *outBuff = buf;
1969 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1970 {
1971 self->m_minMBCharWidth = (size_t)-1;
1972 }
1973 else // ok
1974 {
1975 self->m_minMBCharWidth = outBuff - buf;
1976 }
1977 }
1978
1979 return m_minMBCharWidth;
1980 }
1981
#if wxUSE_UNICODE_UTF8
// Check whether this converter was created for UTF-8: the charset name is
// compared with both of its spellings using wxStricmp().
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
1989
1990 #endif // HAVE_ICONV
1991
1992
1993 // ============================================================================
1994 // Win32 conversion classes
1995 // ============================================================================
1996
1997 #ifdef wxHAVE_WIN32_MB2WC
1998
1999 // from utils.cpp
2000 #if wxUSE_FONTMAP
2001 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
2002 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2003 #endif
2004
2005 class wxMBConv_win32 : public wxMBConv
2006 {
2007 public:
2008 wxMBConv_win32()
2009 {
2010 m_CodePage = CP_ACP;
2011 m_minMBCharWidth = 0;
2012 }
2013
2014 wxMBConv_win32(const wxMBConv_win32& conv)
2015 : wxMBConv()
2016 {
2017 m_CodePage = conv.m_CodePage;
2018 m_minMBCharWidth = conv.m_minMBCharWidth;
2019 }
2020
2021 #if wxUSE_FONTMAP
2022 wxMBConv_win32(const wxChar* name)
2023 {
2024 m_CodePage = wxCharsetToCodepage(name);
2025 m_minMBCharWidth = 0;
2026 }
2027
2028 wxMBConv_win32(wxFontEncoding encoding)
2029 {
2030 m_CodePage = wxEncodingToCodepage(encoding);
2031 m_minMBCharWidth = 0;
2032 }
2033 #endif // wxUSE_FONTMAP
2034
2035 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2036 {
2037 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2038 // the behaviour is not compatible with the Unix version (using iconv)
2039 // and break the library itself, e.g. wxTextInputStream::NextChar()
2040 // wouldn't work if reading an incomplete MB char didn't result in an
2041 // error
2042 //
2043 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2044 // Win XP or newer and it is not supported for UTF-[78] so we always
2045 // use our own conversions in this case. See
2046 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2047 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2048 if ( m_CodePage == CP_UTF8 )
2049 {
2050 return wxMBConvUTF8().MB2WC(buf, psz, n);
2051 }
2052
2053 if ( m_CodePage == CP_UTF7 )
2054 {
2055 return wxMBConvUTF7().MB2WC(buf, psz, n);
2056 }
2057
2058 int flags = 0;
2059 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2060 IsAtLeastWin2kSP4() )
2061 {
2062 flags = MB_ERR_INVALID_CHARS;
2063 }
2064
2065 const size_t len = ::MultiByteToWideChar
2066 (
2067 m_CodePage, // code page
2068 flags, // flags: fall on error
2069 psz, // input string
2070 -1, // its length (NUL-terminated)
2071 buf, // output string
2072 buf ? n : 0 // size of output buffer
2073 );
2074 if ( !len )
2075 {
2076 // function totally failed
2077 return wxCONV_FAILED;
2078 }
2079
2080 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2081 // check if we succeeded, by doing a double trip:
2082 if ( !flags && buf )
2083 {
2084 const size_t mbLen = strlen(psz);
2085 wxCharBuffer mbBuf(mbLen);
2086 if ( ::WideCharToMultiByte
2087 (
2088 m_CodePage,
2089 0,
2090 buf,
2091 -1,
2092 mbBuf.data(),
2093 mbLen + 1, // size in bytes, not length
2094 NULL,
2095 NULL
2096 ) == 0 ||
2097 strcmp(mbBuf, psz) != 0 )
2098 {
2099 // we didn't obtain the same thing we started from, hence
2100 // the conversion was lossy and we consider that it failed
2101 return wxCONV_FAILED;
2102 }
2103 }
2104
2105 // note that it returns count of written chars for buf != NULL and size
2106 // of the needed buffer for buf == NULL so in either case the length of
2107 // the string (which never includes the terminating NUL) is one less
2108 return len - 1;
2109 }
2110
2111 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2112 {
2113 /*
2114 we have a problem here: by default, WideCharToMultiByte() may
2115 replace characters unrepresentable in the target code page with bad
2116 quality approximations such as turning "1/2" symbol (U+00BD) into
2117 "1" for the code pages which don't have it and we, obviously, want
2118 to avoid this at any price
2119
2120 the trouble is that this function does it _silently_, i.e. it won't
2121 even tell us whether it did or not... Win98/2000 and higher provide
2122 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2123 we have to resort to a round trip, i.e. check that converting back
2124 results in the same string -- this is, of course, expensive but
2125 otherwise we simply can't be sure to not garble the data.
2126 */
2127
2128 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2129 // it doesn't work with CJK encodings (which we test for rather roughly
2130 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2131 // supporting it
2132 BOOL usedDef wxDUMMY_INITIALIZE(false);
2133 BOOL *pUsedDef;
2134 int flags;
2135 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2136 {
2137 // it's our lucky day
2138 flags = WC_NO_BEST_FIT_CHARS;
2139 pUsedDef = &usedDef;
2140 }
2141 else // old system or unsupported encoding
2142 {
2143 flags = 0;
2144 pUsedDef = NULL;
2145 }
2146
2147 const size_t len = ::WideCharToMultiByte
2148 (
2149 m_CodePage, // code page
2150 flags, // either none or no best fit
2151 pwz, // input string
2152 -1, // it is (wide) NUL-terminated
2153 buf, // output buffer
2154 buf ? n : 0, // and its size
2155 NULL, // default "replacement" char
2156 pUsedDef // [out] was it used?
2157 );
2158
2159 if ( !len )
2160 {
2161 // function totally failed
2162 return wxCONV_FAILED;
2163 }
2164
2165 // if we were really converting, check if we succeeded
2166 if ( buf )
2167 {
2168 if ( flags )
2169 {
2170 // check if the conversion failed, i.e. if any replacements
2171 // were done
2172 if ( usedDef )
2173 return wxCONV_FAILED;
2174 }
2175 else // we must resort to double tripping...
2176 {
2177 wxWCharBuffer wcBuf(n);
2178 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2179 wcscmp(wcBuf, pwz) != 0 )
2180 {
2181 // we didn't obtain the same thing we started from, hence
2182 // the conversion was lossy and we consider that it failed
2183 return wxCONV_FAILED;
2184 }
2185 }
2186 }
2187
2188 // see the comment above for the reason of "len - 1"
2189 return len - 1;
2190 }
2191
2192 virtual size_t GetMBNulLen() const
2193 {
2194 if ( m_minMBCharWidth == 0 )
2195 {
2196 int len = ::WideCharToMultiByte
2197 (
2198 m_CodePage, // code page
2199 0, // no flags
2200 L"", // input string
2201 1, // translate just the NUL
2202 NULL, // output buffer
2203 0, // and its size
2204 NULL, // no replacement char
2205 NULL // [out] don't care if it was used
2206 );
2207
2208 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2209 switch ( len )
2210 {
2211 default:
2212 wxLogDebug(_T("Unexpected NUL length %d"), len);
2213 self->m_minMBCharWidth = (size_t)-1;
2214 break;
2215
2216 case 0:
2217 self->m_minMBCharWidth = (size_t)-1;
2218 break;
2219
2220 case 1:
2221 case 2:
2222 case 4:
2223 self->m_minMBCharWidth = len;
2224 break;
2225 }
2226 }
2227
2228 return m_minMBCharWidth;
2229 }
2230
2231 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2232
2233 bool IsOk() const { return m_CodePage != -1; }
2234
2235 private:
2236 static bool CanUseNoBestFit()
2237 {
2238 static int s_isWin98Or2k = -1;
2239
2240 if ( s_isWin98Or2k == -1 )
2241 {
2242 int verMaj, verMin;
2243 switch ( wxGetOsVersion(&verMaj, &verMin) )
2244 {
2245 case wxOS_WINDOWS_9X:
2246 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2247 break;
2248
2249 case wxOS_WINDOWS_NT:
2250 s_isWin98Or2k = verMaj >= 5;
2251 break;
2252
2253 default:
2254 // unknown: be conservative by default
2255 s_isWin98Or2k = 0;
2256 break;
2257 }
2258
2259 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2260 }
2261
2262 return s_isWin98Or2k == 1;
2263 }
2264
2265 static bool IsAtLeastWin2kSP4()
2266 {
2267 #ifdef __WXWINCE__
2268 return false;
2269 #else
2270 static int s_isAtLeastWin2kSP4 = -1;
2271
2272 if ( s_isAtLeastWin2kSP4 == -1 )
2273 {
2274 OSVERSIONINFOEX ver;
2275
2276 memset(&ver, 0, sizeof(ver));
2277 ver.dwOSVersionInfoSize = sizeof(ver);
2278 GetVersionEx((OSVERSIONINFO*)&ver);
2279
2280 s_isAtLeastWin2kSP4 =
2281 ((ver.dwMajorVersion > 5) || // Vista+
2282 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2283 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2284 ver.wServicePackMajor >= 4)) // 2000 SP4+
2285 ? 1 : 0;
2286 }
2287
2288 return s_isAtLeastWin2kSP4 == 1;
2289 #endif
2290 }
2291
2292
2293 // the code page we're working with
2294 long m_CodePage;
2295
2296 // cached result of GetMBNulLen(), set to 0 initially meaning
2297 // "unknown"
2298 size_t m_minMBCharWidth;
2299 };
2300
2301 #endif // wxHAVE_WIN32_MB2WC
2302
2303 // ============================================================================
2304 // Cocoa conversion classes
2305 // ============================================================================
2306
2307 #if defined(__WXCOCOA__)
2308
2309 // RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2310 // Strangely enough, Core Foundation uses UTF-32 internally
2311 // quite a bit - it's just not public (yet).
2312
2313 #include <CoreFoundation/CFString.h>
2314 #include <CoreFoundation/CFStringEncodingExt.h>
2315
2316 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2317 {
2318 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2319
2320 switch (encoding)
2321 {
2322 case wxFONTENCODING_DEFAULT :
2323 enc = CFStringGetSystemEncoding();
2324 break ;
2325
2326 case wxFONTENCODING_ISO8859_1 :
2327 enc = kCFStringEncodingISOLatin1 ;
2328 break ;
2329 case wxFONTENCODING_ISO8859_2 :
2330 enc = kCFStringEncodingISOLatin2;
2331 break ;
2332 case wxFONTENCODING_ISO8859_3 :
2333 enc = kCFStringEncodingISOLatin3 ;
2334 break ;
2335 case wxFONTENCODING_ISO8859_4 :
2336 enc = kCFStringEncodingISOLatin4;
2337 break ;
2338 case wxFONTENCODING_ISO8859_5 :
2339 enc = kCFStringEncodingISOLatinCyrillic;
2340 break ;
2341 case wxFONTENCODING_ISO8859_6 :
2342 enc = kCFStringEncodingISOLatinArabic;
2343 break ;
2344 case wxFONTENCODING_ISO8859_7 :
2345 enc = kCFStringEncodingISOLatinGreek;
2346 break ;
2347 case wxFONTENCODING_ISO8859_8 :
2348 enc = kCFStringEncodingISOLatinHebrew;
2349 break ;
2350 case wxFONTENCODING_ISO8859_9 :
2351 enc = kCFStringEncodingISOLatin5;
2352 break ;
2353 case wxFONTENCODING_ISO8859_10 :
2354 enc = kCFStringEncodingISOLatin6;
2355 break ;
2356 case wxFONTENCODING_ISO8859_11 :
2357 enc = kCFStringEncodingISOLatinThai;
2358 break ;
2359 case wxFONTENCODING_ISO8859_13 :
2360 enc = kCFStringEncodingISOLatin7;
2361 break ;
2362 case wxFONTENCODING_ISO8859_14 :
2363 enc = kCFStringEncodingISOLatin8;
2364 break ;
2365 case wxFONTENCODING_ISO8859_15 :
2366 enc = kCFStringEncodingISOLatin9;
2367 break ;
2368
2369 case wxFONTENCODING_KOI8 :
2370 enc = kCFStringEncodingKOI8_R;
2371 break ;
2372 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2373 enc = kCFStringEncodingDOSRussian;
2374 break ;
2375
2376 // case wxFONTENCODING_BULGARIAN :
2377 // enc = ;
2378 // break ;
2379
2380 case wxFONTENCODING_CP437 :
2381 enc = kCFStringEncodingDOSLatinUS ;
2382 break ;
2383 case wxFONTENCODING_CP850 :
2384 enc = kCFStringEncodingDOSLatin1;
2385 break ;
2386 case wxFONTENCODING_CP852 :
2387 enc = kCFStringEncodingDOSLatin2;
2388 break ;
2389 case wxFONTENCODING_CP855 :
2390 enc = kCFStringEncodingDOSCyrillic;
2391 break ;
2392 case wxFONTENCODING_CP866 :
2393 enc = kCFStringEncodingDOSRussian ;
2394 break ;
2395 case wxFONTENCODING_CP874 :
2396 enc = kCFStringEncodingDOSThai;
2397 break ;
2398 case wxFONTENCODING_CP932 :
2399 enc = kCFStringEncodingDOSJapanese;
2400 break ;
2401 case wxFONTENCODING_CP936 :
2402 enc = kCFStringEncodingDOSChineseSimplif ;
2403 break ;
2404 case wxFONTENCODING_CP949 :
2405 enc = kCFStringEncodingDOSKorean;
2406 break ;
2407 case wxFONTENCODING_CP950 :
2408 enc = kCFStringEncodingDOSChineseTrad;
2409 break ;
2410 case wxFONTENCODING_CP1250 :
2411 enc = kCFStringEncodingWindowsLatin2;
2412 break ;
2413 case wxFONTENCODING_CP1251 :
2414 enc = kCFStringEncodingWindowsCyrillic ;
2415 break ;
2416 case wxFONTENCODING_CP1252 :
2417 enc = kCFStringEncodingWindowsLatin1 ;
2418 break ;
2419 case wxFONTENCODING_CP1253 :
2420 enc = kCFStringEncodingWindowsGreek;
2421 break ;
2422 case wxFONTENCODING_CP1254 :
2423 enc = kCFStringEncodingWindowsLatin5;
2424 break ;
2425 case wxFONTENCODING_CP1255 :
2426 enc = kCFStringEncodingWindowsHebrew ;
2427 break ;
2428 case wxFONTENCODING_CP1256 :
2429 enc = kCFStringEncodingWindowsArabic ;
2430 break ;
2431 case wxFONTENCODING_CP1257 :
2432 enc = kCFStringEncodingWindowsBalticRim;
2433 break ;
2434 // This only really encodes to UTF7 (if that) evidently
2435 // case wxFONTENCODING_UTF7 :
2436 // enc = kCFStringEncodingNonLossyASCII ;
2437 // break ;
2438 case wxFONTENCODING_UTF8 :
2439 enc = kCFStringEncodingUTF8 ;
2440 break ;
2441 case wxFONTENCODING_EUC_JP :
2442 enc = kCFStringEncodingEUC_JP;
2443 break ;
2444 case wxFONTENCODING_UTF16 :
2445 enc = kCFStringEncodingUnicode ;
2446 break ;
2447 case wxFONTENCODING_MACROMAN :
2448 enc = kCFStringEncodingMacRoman ;
2449 break ;
2450 case wxFONTENCODING_MACJAPANESE :
2451 enc = kCFStringEncodingMacJapanese ;
2452 break ;
2453 case wxFONTENCODING_MACCHINESETRAD :
2454 enc = kCFStringEncodingMacChineseTrad ;
2455 break ;
2456 case wxFONTENCODING_MACKOREAN :
2457 enc = kCFStringEncodingMacKorean ;
2458 break ;
2459 case wxFONTENCODING_MACARABIC :
2460 enc = kCFStringEncodingMacArabic ;
2461 break ;
2462 case wxFONTENCODING_MACHEBREW :
2463 enc = kCFStringEncodingMacHebrew ;
2464 break ;
2465 case wxFONTENCODING_MACGREEK :
2466 enc = kCFStringEncodingMacGreek ;
2467 break ;
2468 case wxFONTENCODING_MACCYRILLIC :
2469 enc = kCFStringEncodingMacCyrillic ;
2470 break ;
2471 case wxFONTENCODING_MACDEVANAGARI :
2472 enc = kCFStringEncodingMacDevanagari ;
2473 break ;
2474 case wxFONTENCODING_MACGURMUKHI :
2475 enc = kCFStringEncodingMacGurmukhi ;
2476 break ;
2477 case wxFONTENCODING_MACGUJARATI :
2478 enc = kCFStringEncodingMacGujarati ;
2479 break ;
2480 case wxFONTENCODING_MACORIYA :
2481 enc = kCFStringEncodingMacOriya ;
2482 break ;
2483 case wxFONTENCODING_MACBENGALI :
2484 enc = kCFStringEncodingMacBengali ;
2485 break ;
2486 case wxFONTENCODING_MACTAMIL :
2487 enc = kCFStringEncodingMacTamil ;
2488 break ;
2489 case wxFONTENCODING_MACTELUGU :
2490 enc = kCFStringEncodingMacTelugu ;
2491 break ;
2492 case wxFONTENCODING_MACKANNADA :
2493 enc = kCFStringEncodingMacKannada ;
2494 break ;
2495 case wxFONTENCODING_MACMALAJALAM :
2496 enc = kCFStringEncodingMacMalayalam ;
2497 break ;
2498 case wxFONTENCODING_MACSINHALESE :
2499 enc = kCFStringEncodingMacSinhalese ;
2500 break ;
2501 case wxFONTENCODING_MACBURMESE :
2502 enc = kCFStringEncodingMacBurmese ;
2503 break ;
2504 case wxFONTENCODING_MACKHMER :
2505 enc = kCFStringEncodingMacKhmer ;
2506 break ;
2507 case wxFONTENCODING_MACTHAI :
2508 enc = kCFStringEncodingMacThai ;
2509 break ;
2510 case wxFONTENCODING_MACLAOTIAN :
2511 enc = kCFStringEncodingMacLaotian ;
2512 break ;
2513 case wxFONTENCODING_MACGEORGIAN :
2514 enc = kCFStringEncodingMacGeorgian ;
2515 break ;
2516 case wxFONTENCODING_MACARMENIAN :
2517 enc = kCFStringEncodingMacArmenian ;
2518 break ;
2519 case wxFONTENCODING_MACCHINESESIMP :
2520 enc = kCFStringEncodingMacChineseSimp ;
2521 break ;
2522 case wxFONTENCODING_MACTIBETAN :
2523 enc = kCFStringEncodingMacTibetan ;
2524 break ;
2525 case wxFONTENCODING_MACMONGOLIAN :
2526 enc = kCFStringEncodingMacMongolian ;
2527 break ;
2528 case wxFONTENCODING_MACETHIOPIC :
2529 enc = kCFStringEncodingMacEthiopic ;
2530 break ;
2531 case wxFONTENCODING_MACCENTRALEUR :
2532 enc = kCFStringEncodingMacCentralEurRoman ;
2533 break ;
2534 case wxFONTENCODING_MACVIATNAMESE :
2535 enc = kCFStringEncodingMacVietnamese ;
2536 break ;
2537 case wxFONTENCODING_MACARABICEXT :
2538 enc = kCFStringEncodingMacExtArabic ;
2539 break ;
2540 case wxFONTENCODING_MACSYMBOL :
2541 enc = kCFStringEncodingMacSymbol ;
2542 break ;
2543 case wxFONTENCODING_MACDINGBATS :
2544 enc = kCFStringEncodingMacDingbats ;
2545 break ;
2546 case wxFONTENCODING_MACTURKISH :
2547 enc = kCFStringEncodingMacTurkish ;
2548 break ;
2549 case wxFONTENCODING_MACCROATIAN :
2550 enc = kCFStringEncodingMacCroatian ;
2551 break ;
2552 case wxFONTENCODING_MACICELANDIC :
2553 enc = kCFStringEncodingMacIcelandic ;
2554 break ;
2555 case wxFONTENCODING_MACROMANIAN :
2556 enc = kCFStringEncodingMacRomanian ;
2557 break ;
2558 case wxFONTENCODING_MACCELTIC :
2559 enc = kCFStringEncodingMacCeltic ;
2560 break ;
2561 case wxFONTENCODING_MACGAELIC :
2562 enc = kCFStringEncodingMacGaelic ;
2563 break ;
2564 // case wxFONTENCODING_MACKEYBOARD :
2565 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2566 // break ;
2567
2568 default :
2569 // because gcc is picky
2570 break ;
2571 }
2572
2573 return enc ;
2574 }
2575
2576 class wxMBConv_cocoa : public wxMBConv
2577 {
2578 public:
2579 wxMBConv_cocoa()
2580 {
2581 Init(CFStringGetSystemEncoding()) ;
2582 }
2583
2584 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2585 {
2586 m_encoding = conv.m_encoding;
2587 }
2588
2589 #if wxUSE_FONTMAP
2590 wxMBConv_cocoa(const wxChar* name)
2591 {
2592 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2593 }
2594 #endif
2595
2596 wxMBConv_cocoa(wxFontEncoding encoding)
2597 {
2598 Init( wxCFStringEncFromFontEnc(encoding) );
2599 }
2600
2601 virtual ~wxMBConv_cocoa()
2602 {
2603 }
2604
2605 void Init( CFStringEncoding encoding)
2606 {
2607 m_encoding = encoding ;
2608 }
2609
2610 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2611 {
2612 wxASSERT(szUnConv);
2613
2614 CFStringRef theString = CFStringCreateWithBytes (
2615 NULL, //the allocator
2616 (const UInt8*)szUnConv,
2617 strlen(szUnConv),
2618 m_encoding,
2619 false //no BOM/external representation
2620 );
2621
2622 wxASSERT(theString);
2623
2624 size_t nOutLength = CFStringGetLength(theString);
2625
2626 if (szOut == NULL)
2627 {
2628 CFRelease(theString);
2629 return nOutLength;
2630 }
2631
2632 CFRange theRange = { 0, nOutSize };
2633
2634 #if SIZEOF_WCHAR_T == 4
2635 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2636 #endif
2637
2638 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2639
2640 CFRelease(theString);
2641
2642 szUniCharBuffer[nOutLength] = '\0';
2643
2644 #if SIZEOF_WCHAR_T == 4
2645 wxMBConvUTF16 converter;
2646 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2647 delete [] szUniCharBuffer;
2648 #endif
2649
2650 return nOutLength;
2651 }
2652
2653 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2654 {
2655 wxASSERT(szUnConv);
2656
2657 size_t nRealOutSize;
2658 size_t nBufSize = wxWcslen(szUnConv);
2659 UniChar* szUniBuffer = (UniChar*) szUnConv;
2660
2661 #if SIZEOF_WCHAR_T == 4
2662 wxMBConvUTF16 converter ;
2663 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2664 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2665 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2666 nBufSize /= sizeof(UniChar);
2667 #endif
2668
2669 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2670 NULL, //allocator
2671 szUniBuffer,
2672 nBufSize,
2673 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2674 );
2675
2676 wxASSERT(theString);
2677
2678 //Note that CER puts a BOM when converting to unicode
2679 //so we check and use getchars instead in that case
2680 if (m_encoding == kCFStringEncodingUnicode)
2681 {
2682 if (szOut != NULL)
2683 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2684
2685 nRealOutSize = CFStringGetLength(theString) + 1;
2686 }
2687 else
2688 {
2689 CFStringGetBytes(
2690 theString,
2691 CFRangeMake(0, CFStringGetLength(theString)),
2692 m_encoding,
2693 0, //what to put in characters that can't be converted -
2694 //0 tells CFString to return NULL if it meets such a character
2695 false, //not an external representation
2696 (UInt8*) szOut,
2697 nOutSize,
2698 (CFIndex*) &nRealOutSize
2699 );
2700 }
2701
2702 CFRelease(theString);
2703
2704 #if SIZEOF_WCHAR_T == 4
2705 delete[] szUniBuffer;
2706 #endif
2707
2708 return nRealOutSize - 1;
2709 }
2710
2711 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2712
2713 bool IsOk() const
2714 {
2715 return m_encoding != kCFStringEncodingInvalidId &&
2716 CFStringIsEncodingAvailable(m_encoding);
2717 }
2718
2719 private:
2720 CFStringEncoding m_encoding ;
2721 };
2722
2723 #endif // defined(__WXCOCOA__)
2724
2725 // ============================================================================
2726 // Mac conversion classes
2727 // ============================================================================
2728
2729 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2730
2731 class wxMBConv_mac : public wxMBConv
2732 {
2733 public:
2734 wxMBConv_mac()
2735 {
2736 Init(CFStringGetSystemEncoding()) ;
2737 }
2738
2739 wxMBConv_mac(const wxMBConv_mac& conv)
2740 {
2741 Init(conv.m_char_encoding);
2742 }
2743
2744 #if wxUSE_FONTMAP
2745 wxMBConv_mac(const wxChar* name)
2746 {
2747 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2748 }
2749 #endif
2750
2751 wxMBConv_mac(wxFontEncoding encoding)
2752 {
2753 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2754 }
2755
2756 virtual ~wxMBConv_mac()
2757 {
2758 OSStatus status = noErr ;
2759 if (m_MB2WC_converter)
2760 status = TECDisposeConverter(m_MB2WC_converter);
2761 if (m_WC2MB_converter)
2762 status = TECDisposeConverter(m_WC2MB_converter);
2763 }
2764
2765 void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2766 TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2767 {
2768 m_MB2WC_converter = NULL ;
2769 m_WC2MB_converter = NULL ;
2770 m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
2771 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2772 }
2773
2774 virtual void CreateIfNeeded() const
2775 {
2776 if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
2777 {
2778 OSStatus status = noErr ;
2779 status = TECCreateConverter(&m_MB2WC_converter,
2780 m_char_encoding,
2781 m_unicode_encoding);
2782 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2783 status = TECCreateConverter(&m_WC2MB_converter,
2784 m_unicode_encoding,
2785 m_char_encoding);
2786 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2787 }
2788 }
2789
2790 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2791 {
2792 CreateIfNeeded() ;
2793 OSStatus status = noErr ;
2794 ByteCount byteOutLen ;
2795 ByteCount byteInLen = strlen(psz) + 1;
2796 wchar_t *tbuf = NULL ;
2797 UniChar* ubuf = NULL ;
2798 size_t res = 0 ;
2799
2800 if (buf == NULL)
2801 {
2802 // Apple specs say at least 32
2803 n = wxMax( 32, byteInLen ) ;
2804 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2805 }
2806
2807 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2808
2809 #if SIZEOF_WCHAR_T == 4
2810 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2811 #else
2812 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2813 #endif
2814
2815 status = TECConvertText(
2816 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2817 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2818
2819 #if SIZEOF_WCHAR_T == 4
2820 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2821 // is not properly terminated we get random characters at the end
2822 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2823 wxMBConvUTF16 converter ;
2824 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2825 free( ubuf ) ;
2826 #else
2827 res = byteOutLen / sizeof( UniChar ) ;
2828 #endif
2829
2830 if ( buf == NULL )
2831 free(tbuf) ;
2832
2833 if ( buf && res < n)
2834 buf[res] = 0;
2835
2836 return res ;
2837 }
2838
2839 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2840 {
2841 CreateIfNeeded() ;
2842 OSStatus status = noErr ;
2843 ByteCount byteOutLen ;
2844 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2845
2846 char *tbuf = NULL ;
2847
2848 if (buf == NULL)
2849 {
2850 // Apple specs say at least 32
2851 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2852 tbuf = (char*) malloc( n ) ;
2853 }
2854
2855 ByteCount byteBufferLen = n ;
2856 UniChar* ubuf = NULL ;
2857
2858 #if SIZEOF_WCHAR_T == 4
2859 wxMBConvUTF16 converter ;
2860 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2861 byteInLen = unicharlen ;
2862 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2863 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2864 #else
2865 ubuf = (UniChar*) psz ;
2866 #endif
2867
2868 status = TECConvertText(
2869 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2870 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2871
2872 #if SIZEOF_WCHAR_T == 4
2873 free( ubuf ) ;
2874 #endif
2875
2876 if ( buf == NULL )
2877 free(tbuf) ;
2878
2879 size_t res = byteOutLen ;
2880 if ( buf && res < n)
2881 {
2882 buf[res] = 0;
2883
2884 //we need to double-trip to verify it didn't insert any ? in place
2885 //of bogus characters
2886 wxWCharBuffer wcBuf(n);
2887 size_t pszlen = wxWcslen(psz);
2888 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2889 wxWcslen(wcBuf) != pszlen ||
2890 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2891 {
2892 // we didn't obtain the same thing we started from, hence
2893 // the conversion was lossy and we consider that it failed
2894 return wxCONV_FAILED;
2895 }
2896 }
2897
2898 return res ;
2899 }
2900
2901 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2902
2903 bool IsOk() const
2904 {
2905 CreateIfNeeded() ;
2906 return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
2907 }
2908
2909 protected :
2910 mutable TECObjectRef m_MB2WC_converter;
2911 mutable TECObjectRef m_WC2MB_converter;
2912
2913 TextEncodingBase m_char_encoding;
2914 TextEncodingBase m_unicode_encoding;
2915 };
2916
2917 // MB is decomposed (D) normalized UTF8
2918
2919 class wxMBConv_macUTF8D : public wxMBConv_mac
2920 {
2921 public :
2922 wxMBConv_macUTF8D()
2923 {
2924 Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
2925 m_uni = NULL;
2926 m_uniBack = NULL ;
2927 }
2928
2929 virtual ~wxMBConv_macUTF8D()
2930 {
2931 if (m_uni!=NULL)
2932 DisposeUnicodeToTextInfo(&m_uni);
2933 if (m_uniBack!=NULL)
2934 DisposeUnicodeToTextInfo(&m_uniBack);
2935 }
2936
2937 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2938 {
2939 CreateIfNeeded() ;
2940 OSStatus status = noErr ;
2941 ByteCount byteOutLen ;
2942 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2943
2944 char *tbuf = NULL ;
2945
2946 if (buf == NULL)
2947 {
2948 // Apple specs say at least 32
2949 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2950 tbuf = (char*) malloc( n ) ;
2951 }
2952
2953 ByteCount byteBufferLen = n ;
2954 UniChar* ubuf = NULL ;
2955
2956 #if SIZEOF_WCHAR_T == 4
2957 wxMBConvUTF16 converter ;
2958 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2959 byteInLen = unicharlen ;
2960 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2961 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2962 #else
2963 ubuf = (UniChar*) psz ;
2964 #endif
2965
2966 // ubuf is a non-decomposed UniChar buffer
2967
2968 ByteCount dcubuflen = byteInLen * 2 + 2 ;
2969 ByteCount dcubufread , dcubufwritten ;
2970 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2971
2972 ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
2973 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , dcubuf ) ;
2974
2975 // we now convert that decomposed buffer into UTF8
2976
2977 status = TECConvertText(
2978 m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
2979 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2980
2981 free( dcubuf );
2982
2983 #if SIZEOF_WCHAR_T == 4
2984 free( ubuf ) ;
2985 #endif
2986
2987 if ( buf == NULL )
2988 free(tbuf) ;
2989
2990 size_t res = byteOutLen ;
2991 if ( buf && res < n)
2992 {
2993 buf[res] = 0;
2994 // don't test for round-trip fidelity yet, we cannot guarantee it yet
2995 }
2996
2997 return res ;
2998 }
2999
3000 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
3001 {
3002 CreateIfNeeded() ;
3003 OSStatus status = noErr ;
3004 ByteCount byteOutLen ;
3005 ByteCount byteInLen = strlen(psz) + 1;
3006 wchar_t *tbuf = NULL ;
3007 UniChar* ubuf = NULL ;
3008 size_t res = 0 ;
3009
3010 if (buf == NULL)
3011 {
3012 // Apple specs say at least 32
3013 n = wxMax( 32, byteInLen ) ;
3014 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
3015 }
3016
3017 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
3018
3019 #if SIZEOF_WCHAR_T == 4
3020 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
3021 #else
3022 ubuf = (UniChar*) (buf ? buf : tbuf) ;
3023 #endif
3024
3025 ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
3026 ByteCount dcubufread , dcubufwritten ;
3027 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
3028
3029 status = TECConvertText(
3030 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
3031 (TextPtr) dcubuf, dcubuflen, &byteOutLen);
3032 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
3033 // is not properly terminated we get random characters at the end
3034 dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3035
3036 // now from the decomposed UniChar to properly composed uniChar
3037 ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
3038 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , ubuf ) ;
3039
3040 free( dcubuf );
3041 byteOutLen = dcubufwritten ;
3042 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3043
3044
3045 #if SIZEOF_WCHAR_T == 4
3046 wxMBConvUTF16 converter ;
3047 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
3048 free( ubuf ) ;
3049 #else
3050 res = byteOutLen / sizeof( UniChar ) ;
3051 #endif
3052
3053 if ( buf == NULL )
3054 free(tbuf) ;
3055
3056 if ( buf && res < n)
3057 buf[res] = 0;
3058
3059 return res ;
3060 }
3061
3062 virtual void CreateIfNeeded() const
3063 {
3064 wxMBConv_mac::CreateIfNeeded() ;
3065 if ( m_uni == NULL )
3066 {
3067 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3068 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3069 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3070 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
3071 m_map.mappingVersion = kUnicodeUseLatestMapping;
3072
3073 OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
3074 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3075
3076 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3077 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3078 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3079 kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
3080 m_map.mappingVersion = kUnicodeUseLatestMapping;
3081 err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
3082 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3083 }
3084 }
3085 protected :
3086 mutable UnicodeToTextInfo m_uni;
3087 mutable UnicodeToTextInfo m_uniBack;
3088 mutable UnicodeMapping m_map;
3089 };
3090 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3091
3092 // ============================================================================
3093 // wxEncodingConverter based conversion classes
3094 // ============================================================================
3095
3096 #if wxUSE_FONTMAP
3097
3098 class wxMBConv_wxwin : public wxMBConv
3099 {
3100 private:
3101 void Init()
3102 {
3103 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
3104 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
3105 }
3106
3107 public:
3108 // temporarily just use wxEncodingConverter stuff,
3109 // so that it works while a better implementation is built
3110 wxMBConv_wxwin(const wxChar* name)
3111 {
3112 if (name)
3113 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3114 else
3115 m_enc = wxFONTENCODING_SYSTEM;
3116
3117 Init();
3118 }
3119
3120 wxMBConv_wxwin(wxFontEncoding enc)
3121 {
3122 m_enc = enc;
3123
3124 Init();
3125 }
3126
3127 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
3128 {
3129 size_t inbuf = strlen(psz);
3130 if (buf)
3131 {
3132 if (!m2w.Convert(psz, buf))
3133 return wxCONV_FAILED;
3134 }
3135 return inbuf;
3136 }
3137
3138 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
3139 {
3140 const size_t inbuf = wxWcslen(psz);
3141 if (buf)
3142 {
3143 if (!w2m.Convert(psz, buf))
3144 return wxCONV_FAILED;
3145 }
3146
3147 return inbuf;
3148 }
3149
3150 virtual size_t GetMBNulLen() const
3151 {
3152 switch ( m_enc )
3153 {
3154 case wxFONTENCODING_UTF16BE:
3155 case wxFONTENCODING_UTF16LE:
3156 return 2;
3157
3158 case wxFONTENCODING_UTF32BE:
3159 case wxFONTENCODING_UTF32LE:
3160 return 4;
3161
3162 default:
3163 return 1;
3164 }
3165 }
3166
3167 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
3168
3169 bool IsOk() const { return m_ok; }
3170
3171 public:
3172 wxFontEncoding m_enc;
3173 wxEncodingConverter m2w, w2m;
3174
3175 private:
3176 // were we initialized successfully?
3177 bool m_ok;
3178
3179 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
3180 };
3181
3182 // make the constructors available for unit testing
3183 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
3184 {
3185 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
3186 if ( !result->IsOk() )
3187 {
3188 delete result;
3189 return 0;
3190 }
3191
3192 return result;
3193 }
3194
3195 #endif // wxUSE_FONTMAP
3196
3197 // ============================================================================
3198 // wxCSConv implementation
3199 // ============================================================================
3200
3201 void wxCSConv::Init()
3202 {
3203 m_name = NULL;
3204 m_convReal = NULL;
3205 m_deferred = true;
3206 }
3207
3208 wxCSConv::wxCSConv(const wxChar *charset)
3209 {
3210 Init();
3211
3212 if ( charset )
3213 {
3214 SetName(charset);
3215 }
3216
3217 #if wxUSE_FONTMAP
3218 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3219 #else
3220 m_encoding = wxFONTENCODING_SYSTEM;
3221 #endif
3222 }
3223
3224 wxCSConv::wxCSConv(wxFontEncoding encoding)
3225 {
3226 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3227 {
3228 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3229
3230 encoding = wxFONTENCODING_SYSTEM;
3231 }
3232
3233 Init();
3234
3235 m_encoding = encoding;
3236 }
3237
3238 wxCSConv::~wxCSConv()
3239 {
3240 Clear();
3241 }
3242
3243 wxCSConv::wxCSConv(const wxCSConv& conv)
3244 : wxMBConv()
3245 {
3246 Init();
3247
3248 SetName(conv.m_name);
3249 m_encoding = conv.m_encoding;
3250 }
3251
3252 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3253 {
3254 Clear();
3255
3256 SetName(conv.m_name);
3257 m_encoding = conv.m_encoding;
3258
3259 return *this;
3260 }
3261
3262 void wxCSConv::Clear()
3263 {
3264 free(m_name);
3265 delete m_convReal;
3266
3267 m_name = NULL;
3268 m_convReal = NULL;
3269 }
3270
3271 void wxCSConv::SetName(const wxChar *charset)
3272 {
3273 if (charset)
3274 {
3275 m_name = wxStrdup(charset);
3276 m_deferred = true;
3277 }
3278 }
3279
3280 #if wxUSE_FONTMAP
3281
// Cache used by wxCSConv::DoCreate(): for each font encoding it stores the
// charset name with which an iconv converter could be created or an empty
// string if none of the names of this encoding worked.
3282 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3283 wxEncodingNameCache );
3284
3285 static wxEncodingNameCache gs_nameCache;
3286 #endif
3287
3288 wxMBConv *wxCSConv::DoCreate() const
3289 {
3290 #if wxUSE_FONTMAP
3291 wxLogTrace(TRACE_STRCONV,
3292 wxT("creating conversion for %s"),
3293 (m_name ? m_name
3294 : (const wxChar*)wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3295 #endif // wxUSE_FONTMAP
3296
3297 // check for the special case of ASCII or ISO8859-1 charset: as we have
3298 // special knowledge of it anyhow, we don't need to create a special
3299 // conversion object
3300 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3301 m_encoding == wxFONTENCODING_DEFAULT )
3302 {
3303 // don't convert at all
3304 return NULL;
3305 }
3306
3307 // we trust OS to do conversion better than we can so try external
3308 // conversion methods first
3309 //
3310 // the full order is:
3311 // 1. OS conversion (iconv() under Unix or Win32 API)
3312 // 2. hard coded conversions for UTF
3313 // 3. wxEncodingConverter as fall back
3314
3315 // step (1)
3316 #ifdef HAVE_ICONV
3317 #if !wxUSE_FONTMAP
3318 if ( m_name )
3319 #endif // !wxUSE_FONTMAP
3320 {
3321 wxString name(m_name);
3322 #if wxUSE_FONTMAP
3323 wxFontEncoding encoding(m_encoding);
3324 #endif
3325
3326 if ( !name.empty() )
3327 {
3328 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3329 if ( conv->IsOk() )
3330 return conv;
3331
3332 delete conv;
3333
3334 #if wxUSE_FONTMAP
3335 encoding =
3336 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3337 #endif // wxUSE_FONTMAP
3338 }
3339 #if wxUSE_FONTMAP
3340 {
3341 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3342 if ( it != gs_nameCache.end() )
3343 {
3344 if ( it->second.empty() )
3345 return NULL;
3346
3347 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3348 if ( conv->IsOk() )
3349 return conv;
3350
3351 delete conv;
3352 }
3353
3354 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3355 // CS : in case this does not return valid names (eg for MacRoman) encoding
3356 // got a 'failure' entry in the cache all the same, although it just has to
3357 // be created using a different method, so only store failed iconv creation
3358 // attempts (or perhaps we shoulnd't do this at all ?)
3359 if ( names[0] != NULL )
3360 {
3361 for ( ; *names; ++names )
3362 {
3363 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3364 if ( conv->IsOk() )
3365 {
3366 gs_nameCache[encoding] = *names;
3367 return conv;
3368 }
3369
3370 delete conv;
3371 }
3372
3373 gs_nameCache[encoding] = _T(""); // cache the failure
3374 }
3375 }
3376 #endif // wxUSE_FONTMAP
3377 }
3378 #endif // HAVE_ICONV
3379
3380 #ifdef wxHAVE_WIN32_MB2WC
3381 {
3382 #if wxUSE_FONTMAP
3383 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3384 : new wxMBConv_win32(m_encoding);
3385 if ( conv->IsOk() )
3386 return conv;
3387
3388 delete conv;
3389 #else
3390 return NULL;
3391 #endif
3392 }
3393 #endif // wxHAVE_WIN32_MB2WC
3394
3395 #if defined(__WXMAC__)
3396 {
3397 // leave UTF16 and UTF32 to the built-ins of wx
3398 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3399 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3400 {
3401 #if wxUSE_FONTMAP
3402 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3403 : new wxMBConv_mac(m_encoding);
3404 #else
3405 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3406 #endif
3407 if ( conv->IsOk() )
3408 return conv;
3409
3410 delete conv;
3411 }
3412 }
3413 #endif
3414
3415 #if defined(__WXCOCOA__)
3416 {
3417 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3418 {
3419 #if wxUSE_FONTMAP
3420 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3421 : new wxMBConv_cocoa(m_encoding);
3422 #else
3423 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3424 #endif
3425
3426 if ( conv->IsOk() )
3427 return conv;
3428
3429 delete conv;
3430 }
3431 }
3432 #endif
3433 // step (2)
3434 wxFontEncoding enc = m_encoding;
3435 #if wxUSE_FONTMAP
3436 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3437 {
3438 // use "false" to suppress interactive dialogs -- we can be called from
3439 // anywhere and popping up a dialog from here is the last thing we want to
3440 // do
3441 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3442 }
3443 #endif // wxUSE_FONTMAP
3444
3445 switch ( enc )
3446 {
3447 case wxFONTENCODING_UTF7:
3448 return new wxMBConvUTF7;
3449
3450 case wxFONTENCODING_UTF8:
3451 return new wxMBConvUTF8;
3452
3453 case wxFONTENCODING_UTF16BE:
3454 return new wxMBConvUTF16BE;
3455
3456 case wxFONTENCODING_UTF16LE:
3457 return new wxMBConvUTF16LE;
3458
3459 case wxFONTENCODING_UTF32BE:
3460 return new wxMBConvUTF32BE;
3461
3462 case wxFONTENCODING_UTF32LE:
3463 return new wxMBConvUTF32LE;
3464
3465 default:
3466 // nothing to do but put here to suppress gcc warnings
3467 break;
3468 }
3469
3470 // step (3)
3471 #if wxUSE_FONTMAP
3472 {
3473 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3474 : new wxMBConv_wxwin(m_encoding);
3475 if ( conv->IsOk() )
3476 return conv;
3477
3478 delete conv;
3479 }
3480 #endif // wxUSE_FONTMAP
3481
3482 // NB: This is a hack to prevent deadlock. What could otherwise happen
3483 // in Unicode build: wxConvLocal creation ends up being here
3484 // because of some failure and logs the error. But wxLog will try to
3485 // attach a timestamp, for which it will need wxConvLocal (to convert
3486 // time to char* and then wchar_t*), but that fails, tries to log the
3487 // error, but wxLog has an (already locked) critical section that
3488 // guards the static buffer.
3489 static bool alreadyLoggingError = false;
3490 if (!alreadyLoggingError)
3491 {
3492 alreadyLoggingError = true;
3493 wxLogError(_("Cannot convert from the charset '%s'!"),
3494 m_name ? m_name
3495 :
3496 #if wxUSE_FONTMAP
3497 (const wxChar*)wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3498 #else // !wxUSE_FONTMAP
3499 (const wxChar*)wxString::Format(_("encoding %i"), m_encoding).c_str()
3500 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3501 );
3502
3503 alreadyLoggingError = false;
3504 }
3505
3506 return NULL;
3507 }
3508
3509 void wxCSConv::CreateConvIfNeeded() const
3510 {
3511 if ( m_deferred )
3512 {
3513 wxCSConv *self = (wxCSConv *)this; // const_cast
3514
3515 // if we don't have neither the name nor the encoding, use the default
3516 // encoding for this system
3517 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3518 {
3519 #if wxUSE_INTL
3520 self->m_encoding = wxLocale::GetSystemEncoding();
3521 #else
3522 // fallback to some reasonable default:
3523 self->m_encoding = wxFONTENCODING_ISO8859_1;
3524 #endif // wxUSE_INTL
3525 }
3526
3527 self->m_convReal = DoCreate();
3528 self->m_deferred = false;
3529 }
3530 }
3531
3532 bool wxCSConv::IsOk() const
3533 {
3534 CreateConvIfNeeded();
3535
3536 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3537 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3538 return true; // always ok as we do it ourselves
3539
3540 // m_convReal->IsOk() is called at its own creation, so we know it must
3541 // be ok if m_convReal is non-NULL
3542 return m_convReal != NULL;
3543 }
3544
3545 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3546 const char *src, size_t srcLen) const
3547 {
3548 CreateConvIfNeeded();
3549
3550 if (m_convReal)
3551 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3552
3553 // latin-1 (direct)
3554 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3555 }
3556
3557 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3558 const wchar_t *src, size_t srcLen) const
3559 {
3560 CreateConvIfNeeded();
3561
3562 if (m_convReal)
3563 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3564
3565 // latin-1 (direct)
3566 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3567 }
3568
3569 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3570 {
3571 CreateConvIfNeeded();
3572
3573 if (m_convReal)
3574 return m_convReal->MB2WC(buf, psz, n);
3575
3576 // latin-1 (direct)
3577 size_t len = strlen(psz);
3578
3579 if (buf)
3580 {
3581 for (size_t c = 0; c <= len; c++)
3582 buf[c] = (unsigned char)(psz[c]);
3583 }
3584
3585 return len;
3586 }
3587
3588 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3589 {
3590 CreateConvIfNeeded();
3591
3592 if (m_convReal)
3593 return m_convReal->WC2MB(buf, psz, n);
3594
3595 // latin-1 (direct)
3596 const size_t len = wxWcslen(psz);
3597 if (buf)
3598 {
3599 for (size_t c = 0; c <= len; c++)
3600 {
3601 if (psz[c] > 0xFF)
3602 return wxCONV_FAILED;
3603
3604 buf[c] = (char)psz[c];
3605 }
3606 }
3607 else
3608 {
3609 for (size_t c = 0; c <= len; c++)
3610 {
3611 if (psz[c] > 0xFF)
3612 return wxCONV_FAILED;
3613 }
3614 }
3615
3616 return len;
3617 }
3618
3619 size_t wxCSConv::GetMBNulLen() const
3620 {
3621 CreateConvIfNeeded();
3622
3623 if ( m_convReal )
3624 {
3625 return m_convReal->GetMBNulLen();
3626 }
3627
3628 // otherwise, we are ISO-8859-1
3629 return 1;
3630 }
3631
#if wxUSE_UNICODE_UTF8
// Returns true if the real converter reports being an UTF-8 one.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    // without a real converter we are ISO-8859-1, i.e. definitely not UTF-8
    return m_convReal && m_convReal->IsUTF8();
}
#endif
3646
3647
3648 #if wxUSE_UNICODE
3649
3650 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3651 {
3652 if ( !s )
3653 return wxWCharBuffer();
3654
3655 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3656 if ( !wbuf )
3657 wbuf = wxMBConvUTF8().cMB2WX(s);
3658 if ( !wbuf )
3659 wbuf = wxConvISO8859_1.cMB2WX(s);
3660
3661 return wbuf;
3662 }
3663
3664 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3665 {
3666 if ( !ws )
3667 return wxCharBuffer();
3668
3669 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3670 if ( !buf )
3671 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3672
3673 return buf;
3674 }
3675
3676 #endif // wxUSE_UNICODE
3677
3678 // ----------------------------------------------------------------------------
3679 // globals
3680 // ----------------------------------------------------------------------------
3681
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3688
3689 #undef wxConvLibc
3690 #undef wxConvUTF8
3691 #undef wxConvUTF7
3692 #undef wxConvLocal
3693 #undef wxConvISO8859_1
3694
// Defines everything needed for one global converter called "name":
//
//  - an exported pointer name##Ptr, initially NULL
//  - an exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object, which is hence created the first time
//    the accessor is called
//  - a file-static pointer gs_##name##instance initialized by calling the
//    accessor
//
// Parameters:
//  klass       type of the pointer returned by the accessor
//  impl_klass  type of the object actually created
//  name        name of the converter, used to build all the identifiers
//  ctor_args   parenthesized constructor arguments for the object or
//              wxEMPTY_PARAMETER_VALUE if there are none
//
// The macro must be followed by a semicolon as its last statement is not
// terminated.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Simplified version of the macro above for the common case when the
// pointer type and the type of the object created are the same.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3709
// wxConvLibc is always accessed as a plain wxMBConv but the class
// implementing it depends on the platform
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif defined(__WXMAC__) && !defined(__MACH__)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_mac, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// the UTF-8 and UTF-7 converters are default-constructed
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);

// both of these are wxCSConv objects differing only by their encoding
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// these pointers are initialized using the accessor functions and not the
// objects themselves: wxConvCurrent starts as the libc converter and
// wxConvUI as the local one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

// the converter used for the file names: under __WXOSX__ it is either the
// dedicated wxMBConv_macUTF8D object (Carbon builds) or the UTF-8 converter,
// everywhere else it is the libc one
#if defined(__WXMAC__) && defined(TARGET_CARBON)
static wxMBConv_macUTF8D wxConvMacUTF8DObj;
#endif
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __WXOSX__
#if defined(__WXMAC__) && defined(TARGET_CARBON)
                                    &wxConvMacUTF8DObj;
#else
                                    wxGet_wxConvUTF8Ptr();
#endif
#else // !__WXOSX__
                                    wxGet_wxConvLibcPtr();
#endif // __WXOSX__/!__WXOSX__
3740
3741 #else // !wxUSE_WCHAR_T
3742
// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
// stand-ins in absence of wchar_t: in this case the standard converters are
// just plain wxMBConv objects and not all of them are defined
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3749
3750 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T