]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
default values for option and flag (TODO: preferences dialog)
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifndef WX_PRECOMP
19 #include "wx/intl.h"
20 #include "wx/log.h"
21 #include "wx/utils.h"
22 #include "wx/hashmap.h"
23 #endif
24
25 #include "wx/strconv.h"
26
27 #if wxUSE_WCHAR_T
28
29 #ifndef __WXWINCE__
30 #include <errno.h>
31 #endif
32
33 #include <ctype.h>
34 #include <string.h>
35 #include <stdlib.h>
36
37 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
38 #include "wx/msw/private.h"
39 #include "wx/msw/missing.h"
40 #define wxHAVE_WIN32_MB2WC
41 #endif
42
43 #ifdef __SALFORDC__
44 #include <clib.h>
45 #endif
46
47 #ifdef HAVE_ICONV
48 #include <iconv.h>
49 #include "wx/thread.h"
50 #endif
51
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
54
55 #ifdef __WXMAC__
56 #ifndef __DARWIN__
57 #include <ATSUnicode.h>
58 #include <TextCommon.h>
59 #include <TextEncodingConverter.h>
60 #endif
61
62 // includes Mac headers
63 #include "wx/mac/private.h"
64 #endif
65
66
67 #define TRACE_STRCONV _T("strconv")
68
69 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
70 // be 4 bytes
71 #if SIZEOF_WCHAR_T == 2
72 #define WC_UTF16
73 #endif
74
75
76 // ============================================================================
77 // implementation
78 // ============================================================================
79
// Helper used when looking for (possibly multi-byte) NUL terminators: returns
// true if at least one of the n bytes starting at p is not NUL and false if
// all of them are NUL (which is trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
88
89 // ----------------------------------------------------------------------------
90 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
91 // ----------------------------------------------------------------------------
92
93 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
94 {
95 if (input <= 0xffff)
96 {
97 if (output)
98 *output = (wxUint16) input;
99
100 return 1;
101 }
102 else if (input >= 0x110000)
103 {
104 return wxCONV_FAILED;
105 }
106 else
107 {
108 if (output)
109 {
110 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
111 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
112 }
113
114 return 2;
115 }
116 }
117
118 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
119 {
120 if ((*input < 0xd800) || (*input > 0xdfff))
121 {
122 output = *input;
123 return 1;
124 }
125 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
126 {
127 output = *input;
128 return wxCONV_FAILED;
129 }
130 else
131 {
132 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
133 return 2;
134 }
135 }
136
137 #ifdef WC_UTF16
138 typedef wchar_t wxDecodeSurrogate_t;
139 #else // !WC_UTF16
140 typedef wxUint16 wxDecodeSurrogate_t;
141 #endif // WC_UTF16/!WC_UTF16
142
143 // returns the next UTF-32 character from the wchar_t buffer and advances the
144 // pointer to the character after this one
145 //
146 // if an invalid character is found, *pSrc is set to NULL, the caller must
147 // check for this
148 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
149 {
150 wxUint32 out;
151 const size_t
152 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
153 if ( n == wxCONV_FAILED )
154 *pSrc = NULL;
155 else
156 *pSrc += n;
157
158 return out;
159 }
160
161 // ----------------------------------------------------------------------------
162 // wxMBConv
163 // ----------------------------------------------------------------------------
164
// Default implementation of multibyte to wide conversion, written in terms of
// the old MB2WC() function.
//
// dst is the output buffer (may be NULL to only compute the required size),
// dstLen its size in wide characters, src the input and srcLen its length in
// bytes or wxNO_LEN if src is NUL-terminated.
//
// The input is converted chunk by chunk, each chunk ending at a NUL terminator
// of GetMBNulLen() bytes. Returns the number of wide characters (which would
// be) written, counting one L'\0' for every chunk, or wxCONV_FAILED if
// GetMBNulLen() or MB2WC() fails or if dst is too small.
165 size_t
166 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
167 const char *src, size_t srcLen) const
168 {
169 // although new conversion classes are supposed to implement this function
170 // directly, the existing ones only implement the old MB2WC() and so, to
171 // avoid to have to rewrite all conversion classes at once, we provide a
172 // default (but not efficient) implementation of this one in terms of the
173 // old function by copying the input to ensure that it's NUL-terminated and
174 // then using MB2WC() to convert it
175
176 // the number of chars [which would be] written to dst [if it were not NULL]
177 size_t dstWritten = 0;
178
179 // the number of NULs terminating this string
180 size_t nulLen = 0; // not really needed, but just to avoid warnings
181
182 // if we were not given the input size we just have to assume that the
183 // string is properly terminated as we have no way of knowing how long it
184 // is anyhow, but if we do have the size check whether there are enough
185 // NULs at the end
186 wxCharBuffer bufTmp;
187 const char *srcEnd;
188 if ( srcLen != wxNO_LEN )
189 {
190 // we need to know how to find the end of this string
191 nulLen = GetMBNulLen();
192 if ( nulLen == wxCONV_FAILED )
193 return wxCONV_FAILED;
194
195 // if there are enough NULs we can avoid the copy
196 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
197 {
198 // make a copy in order to properly NUL-terminate the string
199 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
200 char * const p = bufTmp.data();
201 memcpy(p, src, srcLen);
// append nulLen NUL bytes after the copied data
202 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
203 *s = '\0';
204
205 src = bufTmp;
206 }
207
208 srcEnd = src + srcLen;
209 }
210 else // quit after the first loop iteration
211 {
// NULL srcEnd is used as the "convert a single chunk only" flag below
212 srcEnd = NULL;
213 }
214
215 for ( ;; )
216 {
217 // try to convert the current chunk
218 size_t lenChunk = MB2WC(NULL, src, 0);
219 if ( lenChunk == wxCONV_FAILED )
220 return wxCONV_FAILED;
221
222 lenChunk++; // for the L'\0' at the end of this chunk
223
224 dstWritten += lenChunk;
225
// NOTE(review): for an empty chunk the L'\0' is counted in dstWritten but
// nothing is stored in dst as we leave the loop before the copy below --
// confirm that the callers don't rely on dst being terminated in this case
226 if ( lenChunk == 1 )
227 {
228 // nothing left in the input string, conversion succeeded
229 break;
230 }
231
232 if ( dst )
233 {
234 if ( dstWritten > dstLen )
235 return wxCONV_FAILED;
236
237 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
238 return wxCONV_FAILED;
239
240 dst += lenChunk;
241 }
242
243 if ( !srcEnd )
244 {
245 // we convert just one chunk in this case as this is the entire
246 // string anyhow
247 break;
248 }
249
250 // advance the input pointer past the end of this chunk
251 while ( NotAllNULs(src, nulLen) )
252 {
253 // notice that we must skip over multiple bytes here as we suppose
254 // that if NUL takes 2 or 4 bytes, then all the other characters do
255 // too and so if advanced by a single byte we might erroneously
256 // detect sequences of NUL bytes in the middle of the input
257 src += nulLen;
258 }
259
260 src += nulLen; // skipping over its terminator as well
261
262 // note that ">=" (and not just "==") is needed here as the terminator
263 // we skipped just above could be inside or just after the buffer
264 // delimited by srcEnd
265 if ( src >= srcEnd )
266 break;
267 }
268
269 return dstWritten;
270 }
271
272 size_t
273 wxMBConv::FromWChar(char *dst, size_t dstLen,
274 const wchar_t *src, size_t srcLen) const
275 {
276 // the number of chars [which would be] written to dst [if it were not NULL]
277 size_t dstWritten = 0;
278
279 // make a copy of the input string unless it is already properly
280 // NUL-terminated
281 //
282 // if we don't know its length we have no choice but to assume that it is,
283 // indeed, properly terminated
284 wxWCharBuffer bufTmp;
285 if ( srcLen == wxNO_LEN )
286 {
287 srcLen = wxWcslen(src) + 1;
288 }
289 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
290 {
291 // make a copy in order to properly NUL-terminate the string
292 bufTmp = wxWCharBuffer(srcLen);
293 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
294 src = bufTmp;
295 }
296
297 const size_t lenNul = GetMBNulLen();
298 for ( const wchar_t * const srcEnd = src + srcLen;
299 src < srcEnd;
300 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
301 {
302 // try to convert the current chunk
303 size_t lenChunk = WC2MB(NULL, src, 0);
304
305 if ( lenChunk == wxCONV_FAILED )
306 return wxCONV_FAILED;
307
308 lenChunk += lenNul;
309 dstWritten += lenChunk;
310
311 if ( dst )
312 {
313 if ( dstWritten > dstLen )
314 return wxCONV_FAILED;
315
316 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
317 return wxCONV_FAILED;
318
319 dst += lenChunk;
320 }
321 }
322
323 return dstWritten;
324 }
325
326 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
327 {
328 size_t rc = ToWChar(outBuff, outLen, inBuff);
329 if ( rc != wxCONV_FAILED )
330 {
331 // ToWChar() returns the buffer length, i.e. including the trailing
332 // NUL, while this method doesn't take it into account
333 rc--;
334 }
335
336 return rc;
337 }
338
339 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
340 {
341 size_t rc = FromWChar(outBuff, outLen, inBuff);
342 if ( rc != wxCONV_FAILED )
343 {
344 rc -= GetMBNulLen();
345 }
346
347 return rc;
348 }
349
350 wxMBConv::~wxMBConv()
351 {
352 // nothing to do here (necessary for Darwin linking probably)
353 }
354
355 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
356 {
357 if ( psz )
358 {
359 // calculate the length of the buffer needed first
360 const size_t nLen = MB2WC(NULL, psz, 0);
361 if ( nLen != wxCONV_FAILED )
362 {
363 // now do the actual conversion
364 wxWCharBuffer buf(nLen /* +1 added implicitly */);
365
366 // +1 for the trailing NULL
367 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
368 return buf;
369 }
370 }
371
372 return wxWCharBuffer();
373 }
374
375 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
376 {
377 if ( pwz )
378 {
379 const size_t nLen = WC2MB(NULL, pwz, 0);
380 if ( nLen != wxCONV_FAILED )
381 {
382 // extra space for trailing NUL(s)
383 static const size_t extraLen = GetMaxMBNulLen();
384
385 wxCharBuffer buf(nLen + extraLen - 1);
386 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
387 return buf;
388 }
389 }
390
391 return wxCharBuffer();
392 }
393
394 const wxWCharBuffer
395 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
396 {
397 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
398 if ( dstLen != wxCONV_FAILED )
399 {
400 wxWCharBuffer wbuf(dstLen - 1);
401 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
402 {
403 if ( outLen )
404 {
405 *outLen = dstLen;
406 if ( wbuf[dstLen - 1] == L'\0' )
407 (*outLen)--;
408 }
409
410 return wbuf;
411 }
412 }
413
414 if ( outLen )
415 *outLen = 0;
416
417 return wxWCharBuffer();
418 }
419
420 const wxCharBuffer
421 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
422 {
423 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
424 if ( dstLen != wxCONV_FAILED )
425 {
426 // special case of empty input: can't allocate 0 size buffer below as
427 // wxCharBuffer insists on NUL-terminating it
428 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
429 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
430 {
431 if ( outLen )
432 {
433 *outLen = dstLen;
434
435 const size_t nulLen = GetMBNulLen();
436 if ( dstLen >= nulLen &&
437 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
438 {
439 // in this case the output is NUL-terminated and we're not
440 // supposed to count NUL
441 *outLen -= nulLen;
442 }
443 }
444
445 return buf;
446 }
447 }
448
449 if ( outLen )
450 *outLen = 0;
451
452 return wxCharBuffer();
453 }
454
455 // ----------------------------------------------------------------------------
456 // wxMBConvLibc
457 // ----------------------------------------------------------------------------
458
459 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
460 {
461 return wxMB2WC(buf, psz, n);
462 }
463
464 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
465 {
466 return wxWC2MB(buf, psz, n);
467 }
468
469 // ----------------------------------------------------------------------------
470 // wxConvBrokenFileNames
471 // ----------------------------------------------------------------------------
472
473 #ifdef __UNIX__
474
475 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
476 {
477 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
478 || wxStricmp(charset, _T("UTF8")) == 0 )
479 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
480 else
481 m_conv = new wxCSConv(charset);
482 }
483
484 #endif // __UNIX__
485
486 // ----------------------------------------------------------------------------
487 // UTF-7
488 // ----------------------------------------------------------------------------
489
490 // Implementation (C) 2004 Fredrik Roubert
491
492 //
493 // BASE64 decoding table
494 //
// The table has 256 entries and is indexed by the value of an input byte: the
// entries for 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/' contain the corresponding
// 6 bit values 0x00-0x3f, all the others contain 0xff which the decoding code
// uses as the "not a BASE64 character" marker.
495 static const unsigned char utf7unb64[] =
496 {
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
500 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
502 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
503 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
504 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
505 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
506 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
507 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
508 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
510 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
511 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
512 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
529 };
530
// Decodes the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL, the decoded characters are stored in it and decoding of
// a new input unit only starts while fewer than n characters were produced; if
// it is NULL, the characters are only counted. Returns the number of wide
// characters produced (not counting the terminating NUL which is only written
// if there is still room for it) or wxCONV_FAILED if a '+' is followed neither
// by '-' nor by enough BASE64 characters to decode at least one byte.
531 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
532 {
533 size_t len = 0;
534
535 while ( *psz && (!buf || (len < n)) )
536 {
537 unsigned char cc = *psz++;
538 if (cc != '+')
539 {
540 // plain ASCII char
541 if (buf)
542 *buf++ = cc;
543 len++;
544 }
545 else if (*psz == '-')
546 {
547 // encoded plus sign
548 if (buf)
549 *buf++ = cc;
550 len++;
551 psz++;
552 }
553 else // start of BASE64 encoded string
554 {
// d accumulates the decoded bits and l is the number of bits in it not
// consumed yet; lsb tells whether the next byte extracted is the low or
// the high byte of the current 16 bit character and ok becomes true as
// soon as at least one byte was extracted
//
// NOTE(review): len is not checked against n inside this loop, so a long
// BASE64 run could write past the end of buf if n is smaller than the
// decoded length -- confirm that the callers always size buf correctly
555 bool lsb, ok;
556 unsigned int d, l;
557 for ( ok = lsb = false, d = 0, l = 0;
558 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
559 psz++ )
560 {
// each BASE64 character contributes 6 more bits
561 d <<= 6;
562 d += cc;
563 for (l += 6; l >= 8; lsb = !lsb)
564 {
// take the topmost 8 unconsumed bits
565 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
566 if (lsb)
567 {
// low byte: this completes the character
568 if (buf)
569 *buf++ |= c;
570 len ++;
571 }
572 else
573 {
// high byte: just store it, the character is counted when its low byte
// arrives
574 if (buf)
575 *buf = (wchar_t)(c << 8);
576 }
577
578 ok = true;
579 }
580 }
581
582 if ( !ok )
583 {
584 // in valid UTF7 we should have valid characters after '+'
585 return wxCONV_FAILED;
586 }
587
// skip the (optional) '-' terminating the BASE64 run
588 if (*psz == '-')
589 psz++;
590 }
591 }
592
593 if ( buf && (len < n) )
594 *buf = '\0';
595
596 return len;
597 }
598
599 //
600 // BASE64 encoding table
601 //
// Maps a 6 bit value (0-63) to the character used to represent it.
602 static const unsigned char utf7enb64[] =
603 {
604 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
605 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
606 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
607 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
608 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
609 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
610 'w', 'x', 'y', 'z', '0', '1', '2', '3',
611 '4', '5', '6', '7', '8', '9', '+', '/'
612 };
613
614 //
615 // UTF-7 encoding table
616 //
// The table is indexed by the character code (0-127) and gives the class of
// the character:
//
617 // 0 - Set D (directly encoded characters)
618 // 1 - Set O (optional direct characters)
619 // 2 - whitespace characters (optional)
620 // 3 - special characters
621 //
// Notice that wxMBConvUTF7::WC2MB() only writes the characters of class 0
// directly and BASE64 encodes all the others.
622 static const unsigned char utf7encode[128] =
623 {
624 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
625 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
626 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
627 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
628 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
629 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
630 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
631 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
632 };
633
// Encodes the NUL-terminated wide string psz as UTF-7.
//
// If buf is not NULL, the output is stored in it and encoding of a new input
// unit only starts while fewer than n bytes were produced; if it is NULL, the
// bytes are only counted. Returns the number of bytes produced (not counting
// the terminating NUL which is only written if there is still room for it) or,
// when WC_UTF16 is not defined, wxCONV_FAILED if a character above 0xffff is
// found at the start of an encoded run.
634 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
635 {
636 size_t len = 0;
637
638 while (*psz && ((!buf) || (len < n)))
639 {
640 wchar_t cc = *psz++;
// NOTE(review): if wchar_t is a signed type, a negative cc would pass the
// "< 0x80" test and index the table out of bounds -- confirm whether this
// can happen
641 if (cc < 0x80 && utf7encode[cc] < 1)
642 {
643 // plain ASCII char
644 if (buf)
645 *buf++ = (char)cc;
646
647 len++;
648 }
649 #ifndef WC_UTF16
650 else if (((wxUint32)cc) > 0xffff)
651 {
652 // no surrogate pair generation (yet?)
653 return wxCONV_FAILED;
654 }
655 #endif
656 else
657 {
// all the other characters are written as '+', BASE64 data and '-'; the
// plus sign itself has no BASE64 data, i.e. it is encoded as "+-"
658 if (buf)
659 *buf++ = '+';
660
661 len++;
662 if (cc != '+')
663 {
664 // BASE64 encode string
// d accumulates the bits to output and l is the number of bits in it not
// written yet
//
// NOTE(review): the characters following the first one of the run are
// not checked for being above 0xffff, only their low 16 bits are
// encoded -- confirm whether this is intended
665 unsigned int lsb, d, l;
666 for (d = 0, l = 0; /*nothing*/; psz++)
667 {
// each character contributes its high byte first and then its low byte
668 for (lsb = 0; lsb < 2; lsb ++)
669 {
670 d <<= 8;
671 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
672
// output as many complete groups of 6 bits as we have
673 for (l += 8; l >= 6; )
674 {
675 l -= 6;
676 if (buf)
677 *buf++ = utf7enb64[(d >> l) % 64];
678 len++;
679 }
680 }
681
// the run ends at the end of the string or at the first character which
// can be written directly; this character is not consumed here
682 cc = *psz;
683 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
684 break;
685 }
686
// output the remaining l bits, if any, left-aligned in a final group
687 if (l != 0)
688 {
689 if (buf)
690 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
691
692 len++;
693 }
694 }
695
696 if (buf)
697 *buf++ = '-';
698 len++;
699 }
700 }
701
702 if (buf && (len < n))
703 *buf = 0;
704
705 return len;
706 }
707
708 // ----------------------------------------------------------------------------
709 // UTF-8
710 // ----------------------------------------------------------------------------
711
712 static wxUint32 utf8_max[]=
713 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
714
715 // boundaries of the private use area we use to (temporarily) remap invalid
716 // characters invalid in a UTF-8 encoded string
717 const wxUint32 wxUnicodePUA = 0x100000;
718 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
719
// Decodes the NUL-terminated UTF-8 string psz.
//
// If buf is not NULL, the decoded characters are stored in it and decoding of
// a new input sequence only starts while fewer than n characters were
// produced; if it is NULL, the characters are only counted. Returns the number
// of wide characters produced, not counting the terminating NUL which is only
// written if there is still room for it.
//
// The handling of invalid input depends on m_options: its bytes are mapped to
// the range starting at wxUnicodePUA if MAP_INVALID_UTF8_TO_PUA is set, to
// backslash followed by 3 octal digits if MAP_INVALID_UTF8_TO_OCTAL is set
// (backslashes in the input are doubled then) and wxCONV_FAILED is returned
// otherwise.
720 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
721 {
722 size_t len = 0;
723
724 while (*psz && ((!buf) || (len < n)))
725 {
// opsz remembers the start of the sequence in case it must be remapped
726 const char *opsz = psz;
727 bool invalid = false;
728 unsigned char cc = *psz++, fc = cc;
// count the leading 1 bits of the first byte
729 unsigned cnt;
730 for (cnt = 0; fc & 0x80; cnt++)
731 fc <<= 1;
732
733 if (!cnt)
734 {
735 // plain ASCII char
736 if (buf)
737 *buf++ = cc;
738 len++;
739
740 // escape the escape character for octal escapes
741 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
742 && cc == '\\' && (!buf || len < n))
743 {
744 if (buf)
745 *buf++ = cc;
746 len++;
747 }
748 }
749 else
750 {
// cnt is now the number of continuation bytes expected
751 cnt--;
752 if (!cnt)
753 {
754 // invalid UTF-8 sequence
// (the first byte had a single leading 1 bit)
755 invalid = true;
756 }
757 else
758 {
759 unsigned ocnt = cnt - 1;
// extract the payload bits of the first byte
760 wxUint32 res = cc & (0x3f >> cnt);
761 while (cnt--)
762 {
763 cc = *psz;
764 if ((cc & 0xC0) != 0x80)
765 {
766 // invalid UTF-8 sequence
767 invalid = true;
768 break;
769 }
770
771 psz++;
772 res = (res << 6) | (cc & 0x3f);
773 }
774
// a value which is not greater than utf8_max[ocnt] could have been encoded
// using a shorter sequence
775 if (invalid || res <= utf8_max[ocnt])
776 {
777 // illegal UTF-8 encoding
778 invalid = true;
779 }
780 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
781 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
782 {
783 // if one of our PUA characters turns up externally
784 // it must also be treated as an illegal sequence
785 // (a bit like you have to escape an escape character)
786 invalid = true;
787 }
788 else
789 {
790 #ifdef WC_UTF16
791 // cast is ok because wchar_t == wxUint16 if WC_UTF16
// NOTE(review): up to 2 units may be written here without checking that
// there is enough room left in buf -- confirm that the callers always size
// buf correctly
792 size_t pa = encode_utf16(res, (wxUint16 *)buf);
793 if (pa == wxCONV_FAILED)
794 {
795 invalid = true;
796 }
797 else
798 {
799 if (buf)
800 buf += pa;
801 len += pa;
802 }
803 #else // !WC_UTF16
804 if (buf)
805 *buf++ = (wchar_t)res;
806 len++;
807 #endif // WC_UTF16/!WC_UTF16
808 }
809 }
810
811 if (invalid)
812 {
813 if (m_options & MAP_INVALID_UTF8_TO_PUA)
814 {
// remap each byte consumed so far to wxUnicodePUA + its value
815 while (opsz < psz && (!buf || len < n))
816 {
817 #ifdef WC_UTF16
818 // cast is ok because wchar_t == wxUint16 if WC_UTF16
819 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
820 wxASSERT(pa != wxCONV_FAILED);
821 if (buf)
822 buf += pa;
823 opsz++;
824 len += pa;
825 #else
826 if (buf)
827 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
828 opsz++;
829 len++;
830 #endif
831 }
832 }
833 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
834 {
// replace each byte consumed so far with a backslash and 3 octal digits
835 while (opsz < psz && (!buf || len < n))
836 {
// NOTE(review): if there is not enough room left for all 4 characters,
// nothing is written but len is still increased below -- confirm that this
// is intended
837 if ( buf && len + 3 < n )
838 {
839 unsigned char on = *opsz;
840 *buf++ = L'\\';
841 *buf++ = (wchar_t)( L'0' + on / 0100 );
842 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
843 *buf++ = (wchar_t)( L'0' + on % 010 );
844 }
845
846 opsz++;
847 len += 4;
848 }
849 }
850 else // MAP_INVALID_UTF8_NOT
851 {
852 return wxCONV_FAILED;
853 }
854 }
855 }
856 }
857
858 if (buf && (len < n))
859 *buf = 0;
860
861 return len;
862 }
863
// returns true if wch is one of the octal digits L'0' to L'7'
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
868
// Encodes the NUL-terminated wide string psz as UTF-8.
//
// If buf is not NULL, the output is stored in it and encoding of a new
// character only starts while fewer than n bytes were produced; if it is NULL,
// the bytes are only counted. Returns the number of bytes produced, not
// counting the terminating NUL which is only written if there is still room
// for it.
//
// Depending on m_options, the characters in the range starting at wxUnicodePUA
// (MAP_INVALID_UTF8_TO_PUA) or the octal escapes (MAP_INVALID_UTF8_TO_OCTAL)
// are converted back to the single bytes they stand for.
869 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
870 {
871 size_t len = 0;
872
873 while (*psz && ((!buf) || (len < n)))
874 {
875 wxUint32 cc;
876
877 #ifdef WC_UTF16
878 // cast is ok for WC_UTF16
// if decoding fails, the offending unit is still in cc and we simply skip
// over it
879 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
880 psz += (pa == wxCONV_FAILED) ? 1 : pa;
881 #else
882 cc = (*psz++) & 0x7fffffff;
883 #endif
884
885 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
886 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
887 {
// restore the original byte
888 if (buf)
889 *buf++ = (char)(cc - wxUnicodePUA);
890 len++;
891 }
892 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
893 && cc == L'\\' && psz[0] == L'\\' )
894 {
// doubled backslash stands for a single one
895 if (buf)
896 *buf++ = (char)cc;
897 psz++;
898 len++;
899 }
900 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
901 cc == L'\\' &&
902 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
903 {
// backslash followed by 3 octal digits stands for the byte with this value
904 if (buf)
905 {
906 *buf++ = (char) ((psz[0] - L'0') * 0100 +
907 (psz[1] - L'0') * 010 +
908 (psz[2] - L'0'));
909 }
910
911 psz += 3;
912 len++;
913 }
914 else
915 {
// find the number of continuation bytes needed for this character
916 unsigned cnt;
917 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
918 {
919 }
920
921 if (!cnt)
922 {
923 // plain ASCII char
924 if (buf)
925 *buf++ = (char) cc;
926 len++;
927 }
928 else
929 {
// NOTE(review): all cnt + 1 bytes are written without checking that there
// is enough room left in buf -- confirm that the callers always size buf
// correctly
930 len += cnt + 1;
931 if (buf)
932 {
// the first byte has cnt + 1 leading 1 bits followed by the topmost bits
// of the character, the other ones carry 6 bits each
933 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
934 while (cnt--)
935 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
936 }
937 }
938 }
939 }
940
941 if (buf && (len < n))
942 *buf = 0;
943
944 return len;
945 }
946
947 // ============================================================================
948 // UTF-16
949 // ============================================================================
950
951 #ifdef WORDS_BIGENDIAN
952 #define wxMBConvUTF16straight wxMBConvUTF16BE
953 #define wxMBConvUTF16swap wxMBConvUTF16LE
954 #else
955 #define wxMBConvUTF16swap wxMBConvUTF16BE
956 #define wxMBConvUTF16straight wxMBConvUTF16LE
957 #endif
958
959 /* static */
960 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
961 {
962 if ( srcLen == wxNO_LEN )
963 {
964 // count the number of bytes in input, including the trailing NULs
965 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
966 for ( srcLen = 1; *inBuff++; srcLen++ )
967 ;
968
969 srcLen *= BYTES_PER_CHAR;
970 }
971 else // we already have the length
972 {
973 // we can only convert an entire number of UTF-16 characters
974 if ( srcLen % BYTES_PER_CHAR )
975 return wxCONV_FAILED;
976 }
977
978 return srcLen;
979 }
980
981 // case when in-memory representation is UTF-16 too
982 #ifdef WC_UTF16
983
984 // ----------------------------------------------------------------------------
985 // conversions without endianness change
986 // ----------------------------------------------------------------------------
987
988 size_t
989 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
990 const char *src, size_t srcLen) const
991 {
992 // set up the scene for using memcpy() (which is presumably more efficient
993 // than copying the bytes one by one)
994 srcLen = GetLength(src, srcLen);
995 if ( srcLen == wxNO_LEN )
996 return wxCONV_FAILED;
997
998 const size_t inLen = srcLen / BYTES_PER_CHAR;
999 if ( dst )
1000 {
1001 if ( dstLen < inLen )
1002 return wxCONV_FAILED;
1003
1004 memcpy(dst, src, srcLen);
1005 }
1006
1007 return inLen;
1008 }
1009
1010 size_t
1011 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1012 const wchar_t *src, size_t srcLen) const
1013 {
1014 if ( srcLen == wxNO_LEN )
1015 srcLen = wxWcslen(src) + 1;
1016
1017 srcLen *= BYTES_PER_CHAR;
1018
1019 if ( dst )
1020 {
1021 if ( dstLen < srcLen )
1022 return wxCONV_FAILED;
1023
1024 memcpy(dst, src, srcLen);
1025 }
1026
1027 return srcLen;
1028 }
1029
1030 // ----------------------------------------------------------------------------
1031 // endian-reversing conversions
1032 // ----------------------------------------------------------------------------
1033
1034 size_t
1035 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1036 const char *src, size_t srcLen) const
1037 {
1038 srcLen = GetLength(src, srcLen);
1039 if ( srcLen == wxNO_LEN )
1040 return wxCONV_FAILED;
1041
1042 srcLen /= BYTES_PER_CHAR;
1043
1044 if ( dst )
1045 {
1046 if ( dstLen < srcLen )
1047 return wxCONV_FAILED;
1048
1049 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1050 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1051 {
1052 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1053 }
1054 }
1055
1056 return srcLen;
1057 }
1058
1059 size_t
1060 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1061 const wchar_t *src, size_t srcLen) const
1062 {
1063 if ( srcLen == wxNO_LEN )
1064 srcLen = wxWcslen(src) + 1;
1065
1066 srcLen *= BYTES_PER_CHAR;
1067
1068 if ( dst )
1069 {
1070 if ( dstLen < srcLen )
1071 return wxCONV_FAILED;
1072
1073 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1074 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1075 {
1076 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1077 }
1078 }
1079
1080 return srcLen;
1081 }
1082
1083 #else // !WC_UTF16: wchar_t is UTF-32
1084
1085 // ----------------------------------------------------------------------------
1086 // conversions without endianness change
1087 // ----------------------------------------------------------------------------
1088
1089 size_t
1090 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1091 const char *src, size_t srcLen) const
1092 {
1093 srcLen = GetLength(src, srcLen);
1094 if ( srcLen == wxNO_LEN )
1095 return wxCONV_FAILED;
1096
1097 const size_t inLen = srcLen / BYTES_PER_CHAR;
1098 if ( !dst )
1099 {
1100 // optimization: return maximal space which could be needed for this
1101 // string even if the real size could be smaller if the buffer contains
1102 // any surrogates
1103 return inLen;
1104 }
1105
1106 size_t outLen = 0;
1107 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1108 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1109 {
1110 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1111 if ( !inBuff )
1112 return wxCONV_FAILED;
1113
1114 if ( ++outLen > dstLen )
1115 return wxCONV_FAILED;
1116
1117 *dst++ = ch;
1118 }
1119
1120
1121 return outLen;
1122 }
1123
1124 size_t
1125 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1126 const wchar_t *src, size_t srcLen) const
1127 {
1128 if ( srcLen == wxNO_LEN )
1129 srcLen = wxWcslen(src) + 1;
1130
1131 size_t outLen = 0;
1132 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1133 for ( size_t n = 0; n < srcLen; n++ )
1134 {
1135 wxUint16 cc[2];
1136 const size_t numChars = encode_utf16(*src++, cc);
1137 if ( numChars == wxCONV_FAILED )
1138 return wxCONV_FAILED;
1139
1140 outLen += numChars * BYTES_PER_CHAR;
1141 if ( outBuff )
1142 {
1143 if ( outLen > dstLen )
1144 return wxCONV_FAILED;
1145
1146 *outBuff++ = cc[0];
1147 if ( numChars == 2 )
1148 {
1149 // second character of a surrogate
1150 *outBuff++ = cc[1];
1151 }
1152 }
1153 }
1154
1155 return outLen;
1156 }
1157
1158 // ----------------------------------------------------------------------------
1159 // endian-reversing conversions
1160 // ----------------------------------------------------------------------------
1161
1162 size_t
1163 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1164 const char *src, size_t srcLen) const
1165 {
1166 srcLen = GetLength(src, srcLen);
1167 if ( srcLen == wxNO_LEN )
1168 return wxCONV_FAILED;
1169
1170 const size_t inLen = srcLen / BYTES_PER_CHAR;
1171 if ( !dst )
1172 {
1173 // optimization: return maximal space which could be needed for this
1174 // string even if the real size could be smaller if the buffer contains
1175 // any surrogates
1176 return inLen;
1177 }
1178
1179 size_t outLen = 0;
1180 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1181 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1182 {
1183 wxUint32 ch;
1184 wxUint16 tmp[2];
1185
1186 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1187 inBuff++;
1188 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1189
1190 const size_t numChars = decode_utf16(tmp, ch);
1191 if ( numChars == wxCONV_FAILED )
1192 return wxCONV_FAILED;
1193
1194 if ( numChars == 2 )
1195 inBuff++;
1196
1197 if ( ++outLen > dstLen )
1198 return wxCONV_FAILED;
1199
1200 *dst++ = ch;
1201 }
1202
1203
1204 return outLen;
1205 }
1206
1207 size_t
1208 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1209 const wchar_t *src, size_t srcLen) const
1210 {
1211 if ( srcLen == wxNO_LEN )
1212 srcLen = wxWcslen(src) + 1;
1213
1214 size_t outLen = 0;
1215 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1216 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1217 {
1218 wxUint16 cc[2];
1219 const size_t numChars = encode_utf16(*src, cc);
1220 if ( numChars == wxCONV_FAILED )
1221 return wxCONV_FAILED;
1222
1223 outLen += numChars * BYTES_PER_CHAR;
1224 if ( outBuff )
1225 {
1226 if ( outLen > dstLen )
1227 return wxCONV_FAILED;
1228
1229 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1230 if ( numChars == 2 )
1231 {
1232 // second character of a surrogate
1233 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1234 }
1235 }
1236 }
1237
1238 return outLen;
1239 }
1240
1241 #endif // WC_UTF16/!WC_UTF16
1242
1243
1244 // ============================================================================
1245 // UTF-32
1246 // ============================================================================
1247
1248 #ifdef WORDS_BIGENDIAN
1249 #define wxMBConvUTF32straight wxMBConvUTF32BE
1250 #define wxMBConvUTF32swap wxMBConvUTF32LE
1251 #else
1252 #define wxMBConvUTF32swap wxMBConvUTF32BE
1253 #define wxMBConvUTF32straight wxMBConvUTF32LE
1254 #endif
1255
1256
// the UTF-32 converter objects, one for each byte order
//
// NOTE(review): WXDLLIMPEXP_DATA_BASE is defined elsewhere, presumably it adds
// the DLL import/export decoration to the type -- confirm
1257 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1258 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1259
1260 /* static */
1261 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1262 {
1263 if ( srcLen == wxNO_LEN )
1264 {
1265 // count the number of bytes in input, including the trailing NULs
1266 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1267 for ( srcLen = 1; *inBuff++; srcLen++ )
1268 ;
1269
1270 srcLen *= BYTES_PER_CHAR;
1271 }
1272 else // we already have the length
1273 {
1274 // we can only convert an entire number of UTF-32 characters
1275 if ( srcLen % BYTES_PER_CHAR )
1276 return wxCONV_FAILED;
1277 }
1278
1279 return srcLen;
1280 }
1281
1282 // case when in-memory representation is UTF-16
1283 #ifdef WC_UTF16
1284
1285 // ----------------------------------------------------------------------------
1286 // conversions without endianness change
1287 // ----------------------------------------------------------------------------
1288
1289 size_t
1290 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1291 const char *src, size_t srcLen) const
1292 {
1293 srcLen = GetLength(src, srcLen);
1294 if ( srcLen == wxNO_LEN )
1295 return wxCONV_FAILED;
1296
1297 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1298 const size_t inLen = srcLen / BYTES_PER_CHAR;
1299 size_t outLen = 0;
1300 for ( size_t n = 0; n < inLen; n++ )
1301 {
1302 wxUint16 cc[2];
1303 const size_t numChars = encode_utf16(*inBuff++, cc);
1304 if ( numChars == wxCONV_FAILED )
1305 return wxCONV_FAILED;
1306
1307 outLen += numChars;
1308 if ( dst )
1309 {
1310 if ( outLen > dstLen )
1311 return wxCONV_FAILED;
1312
1313 *dst++ = cc[0];
1314 if ( numChars == 2 )
1315 {
1316 // second character of a surrogate
1317 *dst++ = cc[1];
1318 }
1319 }
1320 }
1321
1322 return outLen;
1323 }
1324
1325 size_t
1326 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1327 const wchar_t *src, size_t srcLen) const
1328 {
1329 if ( srcLen == wxNO_LEN )
1330 srcLen = wxWcslen(src) + 1;
1331
1332 if ( !dst )
1333 {
1334 // optimization: return maximal space which could be needed for this
1335 // string instead of the exact amount which could be less if there are
1336 // any surrogates in the input
1337 //
1338 // we consider that surrogates are rare enough to make it worthwhile to
1339 // avoid running the loop below at the cost of slightly extra memory
1340 // consumption
1341 return srcLen * BYTES_PER_CHAR;
1342 }
1343
1344 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1345 size_t outLen = 0;
1346 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1347 {
1348 const wxUint32 ch = wxDecodeSurrogate(&src);
1349 if ( !src )
1350 return wxCONV_FAILED;
1351
1352 outLen += BYTES_PER_CHAR;
1353
1354 if ( outLen > dstLen )
1355 return wxCONV_FAILED;
1356
1357 *outBuff++ = ch;
1358 }
1359
1360 return outLen;
1361 }
1362
1363 // ----------------------------------------------------------------------------
1364 // endian-reversing conversions
1365 // ----------------------------------------------------------------------------
1366
1367 size_t
1368 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1369 const char *src, size_t srcLen) const
1370 {
1371 srcLen = GetLength(src, srcLen);
1372 if ( srcLen == wxNO_LEN )
1373 return wxCONV_FAILED;
1374
1375 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1376 const size_t inLen = srcLen / BYTES_PER_CHAR;
1377 size_t outLen = 0;
1378 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1379 {
1380 wxUint16 cc[2];
1381 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1382 if ( numChars == wxCONV_FAILED )
1383 return wxCONV_FAILED;
1384
1385 outLen += numChars;
1386 if ( dst )
1387 {
1388 if ( outLen > dstLen )
1389 return wxCONV_FAILED;
1390
1391 *dst++ = cc[0];
1392 if ( numChars == 2 )
1393 {
1394 // second character of a surrogate
1395 *dst++ = cc[1];
1396 }
1397 }
1398 }
1399
1400 return outLen;
1401 }
1402
1403 size_t
1404 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1405 const wchar_t *src, size_t srcLen) const
1406 {
1407 if ( srcLen == wxNO_LEN )
1408 srcLen = wxWcslen(src) + 1;
1409
1410 if ( !dst )
1411 {
1412 // optimization: return maximal space which could be needed for this
1413 // string instead of the exact amount which could be less if there are
1414 // any surrogates in the input
1415 //
1416 // we consider that surrogates are rare enough to make it worthwhile to
1417 // avoid running the loop below at the cost of slightly extra memory
1418 // consumption
1419 return srcLen*BYTES_PER_CHAR;
1420 }
1421
1422 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1423 size_t outLen = 0;
1424 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1425 {
1426 const wxUint32 ch = wxDecodeSurrogate(&src);
1427 if ( !src )
1428 return wxCONV_FAILED;
1429
1430 outLen += BYTES_PER_CHAR;
1431
1432 if ( outLen > dstLen )
1433 return wxCONV_FAILED;
1434
1435 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1436 }
1437
1438 return outLen;
1439 }
1440
1441 #else // !WC_UTF16: wchar_t is UTF-32
1442
1443 // ----------------------------------------------------------------------------
1444 // conversions without endianness change
1445 // ----------------------------------------------------------------------------
1446
1447 size_t
1448 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1449 const char *src, size_t srcLen) const
1450 {
1451 // use memcpy() as it should be much faster than hand-written loop
1452 srcLen = GetLength(src, srcLen);
1453 if ( srcLen == wxNO_LEN )
1454 return wxCONV_FAILED;
1455
1456 const size_t inLen = srcLen/BYTES_PER_CHAR;
1457 if ( dst )
1458 {
1459 if ( dstLen < inLen )
1460 return wxCONV_FAILED;
1461
1462 memcpy(dst, src, srcLen);
1463 }
1464
1465 return inLen;
1466 }
1467
1468 size_t
1469 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1470 const wchar_t *src, size_t srcLen) const
1471 {
1472 if ( srcLen == wxNO_LEN )
1473 srcLen = wxWcslen(src) + 1;
1474
1475 srcLen *= BYTES_PER_CHAR;
1476
1477 if ( dst )
1478 {
1479 if ( dstLen < srcLen )
1480 return wxCONV_FAILED;
1481
1482 memcpy(dst, src, srcLen);
1483 }
1484
1485 return srcLen;
1486 }
1487
1488 // ----------------------------------------------------------------------------
1489 // endian-reversing conversions
1490 // ----------------------------------------------------------------------------
1491
1492 size_t
1493 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1494 const char *src, size_t srcLen) const
1495 {
1496 srcLen = GetLength(src, srcLen);
1497 if ( srcLen == wxNO_LEN )
1498 return wxCONV_FAILED;
1499
1500 srcLen /= BYTES_PER_CHAR;
1501
1502 if ( dst )
1503 {
1504 if ( dstLen < srcLen )
1505 return wxCONV_FAILED;
1506
1507 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1508 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1509 {
1510 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1511 }
1512 }
1513
1514 return srcLen;
1515 }
1516
1517 size_t
1518 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1519 const wchar_t *src, size_t srcLen) const
1520 {
1521 if ( srcLen == wxNO_LEN )
1522 srcLen = wxWcslen(src) + 1;
1523
1524 srcLen *= BYTES_PER_CHAR;
1525
1526 if ( dst )
1527 {
1528 if ( dstLen < srcLen )
1529 return wxCONV_FAILED;
1530
1531 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1532 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1533 {
1534 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1535 }
1536 }
1537
1538 return srcLen;
1539 }
1540
1541 #endif // WC_UTF16/!WC_UTF16
1542
1543
1544 // ============================================================================
1545 // The classes doing conversion using the iconv_xxx() functions
1546 // ============================================================================
1547
1548 #ifdef HAVE_ICONV
1549
1550 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1551 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1552 // (unless there's yet another bug in glibc) the only case when iconv()
1553 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1554 // left in the input buffer -- when _real_ error occurs,
1555 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1556 // iconv() failure.
1557 // [This bug does not appear in glibc 2.2.]
1558 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1559 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1560 (errno != E2BIG || bufLeft != 0))
1561 #else
1562 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1563 #endif
1564
1565 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1566
1567 #define ICONV_T_INVALID ((iconv_t)-1)
1568
1569 #if SIZEOF_WCHAR_T == 4
1570 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1571 #define WC_ENC wxFONTENCODING_UTF32
1572 #elif SIZEOF_WCHAR_T == 2
1573 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1574 #define WC_ENC wxFONTENCODING_UTF16
1575 #else // sizeof(wchar_t) != 2 nor 4
1576 // does this ever happen?
1577 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1578 #endif
1579
1580 // ----------------------------------------------------------------------------
1581 // wxMBConv_iconv: encapsulates an iconv character set
1582 // ----------------------------------------------------------------------------
1583
1584 class wxMBConv_iconv : public wxMBConv
1585 {
1586 public:
1587 wxMBConv_iconv(const wxChar *name);
1588 virtual ~wxMBConv_iconv();
1589
1590 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1591 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1592
1593 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1594 virtual size_t GetMBNulLen() const;
1595
1596 virtual wxMBConv *Clone() const
1597 {
1598 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1599 p->m_minMBCharWidth = m_minMBCharWidth;
1600 return p;
1601 }
1602
1603 bool IsOk() const
1604 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1605
1606 protected:
1607 // the iconv handlers used to translate from multibyte
1608 // to wide char and in the other direction
1609 iconv_t m2w,
1610 w2m;
1611
1612 #if wxUSE_THREADS
1613 // guards access to m2w and w2m objects
1614 wxMutex m_iconvMutex;
1615 #endif
1616
1617 private:
1618 // the name (for iconv_open()) of a wide char charset -- if none is
1619 // available on this machine, it will remain NULL
1620 static wxString ms_wcCharsetName;
1621
1622 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1623 // different endian-ness than the native one
1624 static bool ms_wcNeedsSwap;
1625
1626
1627 // name of the encoding handled by this conversion
1628 wxString m_name;
1629
1630 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1631 // initially
1632 size_t m_minMBCharWidth;
1633 };
1634
1635 // make the constructor available for unit testing
1636 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1637 {
1638 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1639 if ( !result->IsOk() )
1640 {
1641 delete result;
1642 return 0;
1643 }
1644
1645 return result;
1646 }
1647
1648 wxString wxMBConv_iconv::ms_wcCharsetName;
1649 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1650
1651 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1652 : m_name(name)
1653 {
1654 m_minMBCharWidth = 0;
1655
1656 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1657 // names for the charsets
1658 const wxCharBuffer cname(wxString(name).ToAscii());
1659
1660 // check for charset that represents wchar_t:
1661 if ( ms_wcCharsetName.empty() )
1662 {
1663 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1664
1665 #if wxUSE_FONTMAP
1666 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1667 #else // !wxUSE_FONTMAP
1668 static const wxChar *names_static[] =
1669 {
1670 #if SIZEOF_WCHAR_T == 4
1671 _T("UCS-4"),
1672 #elif SIZEOF_WCHAR_T = 2
1673 _T("UCS-2"),
1674 #endif
1675 NULL
1676 };
1677 const wxChar **names = names_static;
1678 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1679
1680 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1681 {
1682 const wxString nameCS(*names);
1683
1684 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1685 wxString nameXE(nameCS);
1686
1687 #ifdef WORDS_BIGENDIAN
1688 nameXE += _T("BE");
1689 #else // little endian
1690 nameXE += _T("LE");
1691 #endif
1692
1693 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1694 nameXE.c_str());
1695
1696 m2w = iconv_open(nameXE.ToAscii(), cname);
1697 if ( m2w == ICONV_T_INVALID )
1698 {
1699 // try charset w/o bytesex info (e.g. "UCS4")
1700 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1701 nameCS.c_str());
1702 m2w = iconv_open(nameCS.ToAscii(), cname);
1703
1704 // and check for bytesex ourselves:
1705 if ( m2w != ICONV_T_INVALID )
1706 {
1707 char buf[2], *bufPtr;
1708 wchar_t wbuf[2], *wbufPtr;
1709 size_t insz, outsz;
1710 size_t res;
1711
1712 buf[0] = 'A';
1713 buf[1] = 0;
1714 wbuf[0] = 0;
1715 insz = 2;
1716 outsz = SIZEOF_WCHAR_T * 2;
1717 wbufPtr = wbuf;
1718 bufPtr = buf;
1719
1720 res = iconv(
1721 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1722 (char**)&wbufPtr, &outsz);
1723
1724 if (ICONV_FAILED(res, insz))
1725 {
1726 wxLogLastError(wxT("iconv"));
1727 wxLogError(_("Conversion to charset '%s' doesn't work."),
1728 nameCS.c_str());
1729 }
1730 else // ok, can convert to this encoding, remember it
1731 {
1732 ms_wcCharsetName = nameCS;
1733 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1734 }
1735 }
1736 }
1737 else // use charset not requiring byte swapping
1738 {
1739 ms_wcCharsetName = nameXE;
1740 }
1741 }
1742
1743 wxLogTrace(TRACE_STRCONV,
1744 wxT("iconv wchar_t charset is \"%s\"%s"),
1745 ms_wcCharsetName.empty() ? _T("<none>")
1746 : ms_wcCharsetName.c_str(),
1747 ms_wcNeedsSwap ? _T(" (needs swap)")
1748 : _T(""));
1749 }
1750 else // we already have ms_wcCharsetName
1751 {
1752 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1753 }
1754
1755 if ( ms_wcCharsetName.empty() )
1756 {
1757 w2m = ICONV_T_INVALID;
1758 }
1759 else
1760 {
1761 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1762 if ( w2m == ICONV_T_INVALID )
1763 {
1764 wxLogTrace(TRACE_STRCONV,
1765 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1766 ms_wcCharsetName.c_str(), cname.data());
1767 }
1768 }
1769 }
1770
1771 wxMBConv_iconv::~wxMBConv_iconv()
1772 {
1773 if ( m2w != ICONV_T_INVALID )
1774 iconv_close(m2w);
1775 if ( w2m != ICONV_T_INVALID )
1776 iconv_close(w2m);
1777 }
1778
1779 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1780 {
1781 // find the string length: notice that must be done differently for
1782 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1783 size_t inbuf;
1784 const size_t nulLen = GetMBNulLen();
1785 switch ( nulLen )
1786 {
1787 default:
1788 return wxCONV_FAILED;
1789
1790 case 1:
1791 inbuf = strlen(psz); // arguably more optimized than our version
1792 break;
1793
1794 case 2:
1795 case 4:
1796 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1797 // they also have to start at character boundary and not span two
1798 // adjacent characters
1799 const char *p;
1800 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1801 ;
1802 inbuf = p - psz;
1803 break;
1804 }
1805
1806 #if wxUSE_THREADS
1807 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1808 // Unfortunately there are a couple of global wxCSConv objects such as
1809 // wxConvLocal that are used all over wx code, so we have to make sure
1810 // the handle is used by at most one thread at the time. Otherwise
1811 // only a few wx classes would be safe to use from non-main threads
1812 // as MB<->WC conversion would fail "randomly".
1813 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1814 #endif // wxUSE_THREADS
1815
1816 size_t outbuf = n * SIZEOF_WCHAR_T;
1817 size_t res, cres;
1818 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1819 wchar_t *bufPtr = buf;
1820 const char *pszPtr = psz;
1821
1822 if (buf)
1823 {
1824 // have destination buffer, convert there
1825 cres = iconv(m2w,
1826 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1827 (char**)&bufPtr, &outbuf);
1828 res = n - (outbuf / SIZEOF_WCHAR_T);
1829
1830 if (ms_wcNeedsSwap)
1831 {
1832 // convert to native endianness
1833 for ( unsigned i = 0; i < res; i++ )
1834 buf[n] = WC_BSWAP(buf[i]);
1835 }
1836
1837 // NUL-terminate the string if there is any space left
1838 if (res < n)
1839 buf[res] = 0;
1840 }
1841 else
1842 {
1843 // no destination buffer... convert using temp buffer
1844 // to calculate destination buffer requirement
1845 wchar_t tbuf[8];
1846 res = 0;
1847
1848 do
1849 {
1850 bufPtr = tbuf;
1851 outbuf = 8 * SIZEOF_WCHAR_T;
1852
1853 cres = iconv(m2w,
1854 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1855 (char**)&bufPtr, &outbuf );
1856
1857 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1858 }
1859 while ((cres == (size_t)-1) && (errno == E2BIG));
1860 }
1861
1862 if (ICONV_FAILED(cres, inbuf))
1863 {
1864 //VS: it is ok if iconv fails, hence trace only
1865 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1866 return wxCONV_FAILED;
1867 }
1868
1869 return res;
1870 }
1871
1872 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1873 {
1874 #if wxUSE_THREADS
1875 // NB: explained in MB2WC
1876 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1877 #endif
1878
1879 size_t inlen = wxWcslen(psz);
1880 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1881 size_t outbuf = n;
1882 size_t res, cres;
1883
1884 wchar_t *tmpbuf = 0;
1885
1886 if (ms_wcNeedsSwap)
1887 {
1888 // need to copy to temp buffer to switch endianness
1889 // (doing WC_BSWAP twice on the original buffer won't help, as it
1890 // could be in read-only memory, or be accessed in some other thread)
1891 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1892 for ( size_t i = 0; i < inlen; i++ )
1893 tmpbuf[n] = WC_BSWAP(psz[i]);
1894
1895 tmpbuf[inlen] = L'\0';
1896 psz = tmpbuf;
1897 }
1898
1899 if (buf)
1900 {
1901 // have destination buffer, convert there
1902 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1903
1904 res = n - outbuf;
1905
1906 // NB: iconv was given only wcslen(psz) characters on input, and so
1907 // it couldn't convert the trailing zero. Let's do it ourselves
1908 // if there's some room left for it in the output buffer.
1909 if (res < n)
1910 buf[0] = 0;
1911 }
1912 else
1913 {
1914 // no destination buffer: convert using temp buffer
1915 // to calculate destination buffer requirement
1916 char tbuf[16];
1917 res = 0;
1918 do
1919 {
1920 buf = tbuf;
1921 outbuf = 16;
1922
1923 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1924
1925 res += 16 - outbuf;
1926 }
1927 while ((cres == (size_t)-1) && (errno == E2BIG));
1928 }
1929
1930 if (ms_wcNeedsSwap)
1931 {
1932 free(tmpbuf);
1933 }
1934
1935 if (ICONV_FAILED(cres, inbuf))
1936 {
1937 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1938 return wxCONV_FAILED;
1939 }
1940
1941 return res;
1942 }
1943
1944 size_t wxMBConv_iconv::GetMBNulLen() const
1945 {
1946 if ( m_minMBCharWidth == 0 )
1947 {
1948 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1949
1950 #if wxUSE_THREADS
1951 // NB: explained in MB2WC
1952 wxMutexLocker lock(self->m_iconvMutex);
1953 #endif
1954
1955 wchar_t *wnul = L"";
1956 char buf[8]; // should be enough for NUL in any encoding
1957 size_t inLen = sizeof(wchar_t),
1958 outLen = WXSIZEOF(buf);
1959 char *inBuff = (char *)wnul;
1960 char *outBuff = buf;
1961 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1962 {
1963 self->m_minMBCharWidth = (size_t)-1;
1964 }
1965 else // ok
1966 {
1967 self->m_minMBCharWidth = outBuff - buf;
1968 }
1969 }
1970
1971 return m_minMBCharWidth;
1972 }
1973
1974 #endif // HAVE_ICONV
1975
1976
1977 // ============================================================================
1978 // Win32 conversion classes
1979 // ============================================================================
1980
1981 #ifdef wxHAVE_WIN32_MB2WC
1982
1983 // from utils.cpp
1984 #if wxUSE_FONTMAP
1985 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1986 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1987 #endif
1988
1989 class wxMBConv_win32 : public wxMBConv
1990 {
1991 public:
1992 wxMBConv_win32()
1993 {
1994 m_CodePage = CP_ACP;
1995 m_minMBCharWidth = 0;
1996 }
1997
1998 wxMBConv_win32(const wxMBConv_win32& conv)
1999 : wxMBConv()
2000 {
2001 m_CodePage = conv.m_CodePage;
2002 m_minMBCharWidth = conv.m_minMBCharWidth;
2003 }
2004
2005 #if wxUSE_FONTMAP
2006 wxMBConv_win32(const wxChar* name)
2007 {
2008 m_CodePage = wxCharsetToCodepage(name);
2009 m_minMBCharWidth = 0;
2010 }
2011
2012 wxMBConv_win32(wxFontEncoding encoding)
2013 {
2014 m_CodePage = wxEncodingToCodepage(encoding);
2015 m_minMBCharWidth = 0;
2016 }
2017 #endif // wxUSE_FONTMAP
2018
2019 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2020 {
2021 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2022 // the behaviour is not compatible with the Unix version (using iconv)
2023 // and break the library itself, e.g. wxTextInputStream::NextChar()
2024 // wouldn't work if reading an incomplete MB char didn't result in an
2025 // error
2026 //
2027 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2028 // Win XP or newer and it is not supported for UTF-[78] so we always
2029 // use our own conversions in this case. See
2030 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2031 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2032 if ( m_CodePage == CP_UTF8 )
2033 {
2034 return wxConvUTF8.MB2WC(buf, psz, n);
2035 }
2036
2037 if ( m_CodePage == CP_UTF7 )
2038 {
2039 return wxConvUTF7.MB2WC(buf, psz, n);
2040 }
2041
2042 int flags = 0;
2043 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2044 IsAtLeastWin2kSP4() )
2045 {
2046 flags = MB_ERR_INVALID_CHARS;
2047 }
2048
2049 const size_t len = ::MultiByteToWideChar
2050 (
2051 m_CodePage, // code page
2052 flags, // flags: fall on error
2053 psz, // input string
2054 -1, // its length (NUL-terminated)
2055 buf, // output string
2056 buf ? n : 0 // size of output buffer
2057 );
2058 if ( !len )
2059 {
2060 // function totally failed
2061 return wxCONV_FAILED;
2062 }
2063
2064 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2065 // check if we succeeded, by doing a double trip:
2066 if ( !flags && buf )
2067 {
2068 const size_t mbLen = strlen(psz);
2069 wxCharBuffer mbBuf(mbLen);
2070 if ( ::WideCharToMultiByte
2071 (
2072 m_CodePage,
2073 0,
2074 buf,
2075 -1,
2076 mbBuf.data(),
2077 mbLen + 1, // size in bytes, not length
2078 NULL,
2079 NULL
2080 ) == 0 ||
2081 strcmp(mbBuf, psz) != 0 )
2082 {
2083 // we didn't obtain the same thing we started from, hence
2084 // the conversion was lossy and we consider that it failed
2085 return wxCONV_FAILED;
2086 }
2087 }
2088
2089 // note that it returns count of written chars for buf != NULL and size
2090 // of the needed buffer for buf == NULL so in either case the length of
2091 // the string (which never includes the terminating NUL) is one less
2092 return len - 1;
2093 }
2094
2095 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2096 {
2097 /*
2098 we have a problem here: by default, WideCharToMultiByte() may
2099 replace characters unrepresentable in the target code page with bad
2100 quality approximations such as turning "1/2" symbol (U+00BD) into
2101 "1" for the code pages which don't have it and we, obviously, want
2102 to avoid this at any price
2103
2104 the trouble is that this function does it _silently_, i.e. it won't
2105 even tell us whether it did or not... Win98/2000 and higher provide
2106 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2107 we have to resort to a round trip, i.e. check that converting back
2108 results in the same string -- this is, of course, expensive but
2109 otherwise we simply can't be sure to not garble the data.
2110 */
2111
2112 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2113 // it doesn't work with CJK encodings (which we test for rather roughly
2114 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2115 // supporting it
2116 BOOL usedDef wxDUMMY_INITIALIZE(false);
2117 BOOL *pUsedDef;
2118 int flags;
2119 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2120 {
2121 // it's our lucky day
2122 flags = WC_NO_BEST_FIT_CHARS;
2123 pUsedDef = &usedDef;
2124 }
2125 else // old system or unsupported encoding
2126 {
2127 flags = 0;
2128 pUsedDef = NULL;
2129 }
2130
2131 const size_t len = ::WideCharToMultiByte
2132 (
2133 m_CodePage, // code page
2134 flags, // either none or no best fit
2135 pwz, // input string
2136 -1, // it is (wide) NUL-terminated
2137 buf, // output buffer
2138 buf ? n : 0, // and its size
2139 NULL, // default "replacement" char
2140 pUsedDef // [out] was it used?
2141 );
2142
2143 if ( !len )
2144 {
2145 // function totally failed
2146 return wxCONV_FAILED;
2147 }
2148
2149 // if we were really converting, check if we succeeded
2150 if ( buf )
2151 {
2152 if ( flags )
2153 {
2154 // check if the conversion failed, i.e. if any replacements
2155 // were done
2156 if ( usedDef )
2157 return wxCONV_FAILED;
2158 }
2159 else // we must resort to double tripping...
2160 {
2161 wxWCharBuffer wcBuf(n);
2162 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2163 wcscmp(wcBuf, pwz) != 0 )
2164 {
2165 // we didn't obtain the same thing we started from, hence
2166 // the conversion was lossy and we consider that it failed
2167 return wxCONV_FAILED;
2168 }
2169 }
2170 }
2171
2172 // see the comment above for the reason of "len - 1"
2173 return len - 1;
2174 }
2175
2176 virtual size_t GetMBNulLen() const
2177 {
2178 if ( m_minMBCharWidth == 0 )
2179 {
2180 int len = ::WideCharToMultiByte
2181 (
2182 m_CodePage, // code page
2183 0, // no flags
2184 L"", // input string
2185 1, // translate just the NUL
2186 NULL, // output buffer
2187 0, // and its size
2188 NULL, // no replacement char
2189 NULL // [out] don't care if it was used
2190 );
2191
2192 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2193 switch ( len )
2194 {
2195 default:
2196 wxLogDebug(_T("Unexpected NUL length %d"), len);
2197 self->m_minMBCharWidth = (size_t)-1;
2198 break;
2199
2200 case 0:
2201 self->m_minMBCharWidth = (size_t)-1;
2202 break;
2203
2204 case 1:
2205 case 2:
2206 case 4:
2207 self->m_minMBCharWidth = len;
2208 break;
2209 }
2210 }
2211
2212 return m_minMBCharWidth;
2213 }
2214
2215 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2216
2217 bool IsOk() const { return m_CodePage != -1; }
2218
2219 private:
2220 static bool CanUseNoBestFit()
2221 {
2222 static int s_isWin98Or2k = -1;
2223
2224 if ( s_isWin98Or2k == -1 )
2225 {
2226 int verMaj, verMin;
2227 switch ( wxGetOsVersion(&verMaj, &verMin) )
2228 {
2229 case wxOS_WINDOWS_9X:
2230 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2231 break;
2232
2233 case wxOS_WINDOWS_NT:
2234 s_isWin98Or2k = verMaj >= 5;
2235 break;
2236
2237 default:
2238 // unknown: be conservative by default
2239 s_isWin98Or2k = 0;
2240 break;
2241 }
2242
2243 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2244 }
2245
2246 return s_isWin98Or2k == 1;
2247 }
2248
2249 static bool IsAtLeastWin2kSP4()
2250 {
2251 #ifdef __WXWINCE__
2252 return false;
2253 #else
2254 static int s_isAtLeastWin2kSP4 = -1;
2255
2256 if ( s_isAtLeastWin2kSP4 == -1 )
2257 {
2258 OSVERSIONINFOEX ver;
2259
2260 memset(&ver, 0, sizeof(ver));
2261 ver.dwOSVersionInfoSize = sizeof(ver);
2262 GetVersionEx((OSVERSIONINFO*)&ver);
2263
2264 s_isAtLeastWin2kSP4 =
2265 ((ver.dwMajorVersion > 5) || // Vista+
2266 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2267 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2268 ver.wServicePackMajor >= 4)) // 2000 SP4+
2269 ? 1 : 0;
2270 }
2271
2272 return s_isAtLeastWin2kSP4 == 1;
2273 #endif
2274 }
2275
2276
2277 // the code page we're working with
2278 long m_CodePage;
2279
2280 // cached result of GetMBNulLen(), set to 0 initially meaning
2281 // "unknown"
2282 size_t m_minMBCharWidth;
2283 };
2284
2285 #endif // wxHAVE_WIN32_MB2WC
2286
2287 // ============================================================================
2288 // Cocoa conversion classes
2289 // ============================================================================
2290
2291 #if defined(__WXCOCOA__)
2292
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a
// bit - it's just not public (yet).
2296
2297 #include <CoreFoundation/CFString.h>
2298 #include <CoreFoundation/CFStringEncodingExt.h>
2299
2300 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2301 {
2302 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2303
2304 switch (encoding)
2305 {
2306 case wxFONTENCODING_DEFAULT :
2307 enc = CFStringGetSystemEncoding();
2308 break ;
2309
2310 case wxFONTENCODING_ISO8859_1 :
2311 enc = kCFStringEncodingISOLatin1 ;
2312 break ;
2313 case wxFONTENCODING_ISO8859_2 :
2314 enc = kCFStringEncodingISOLatin2;
2315 break ;
2316 case wxFONTENCODING_ISO8859_3 :
2317 enc = kCFStringEncodingISOLatin3 ;
2318 break ;
2319 case wxFONTENCODING_ISO8859_4 :
2320 enc = kCFStringEncodingISOLatin4;
2321 break ;
2322 case wxFONTENCODING_ISO8859_5 :
2323 enc = kCFStringEncodingISOLatinCyrillic;
2324 break ;
2325 case wxFONTENCODING_ISO8859_6 :
2326 enc = kCFStringEncodingISOLatinArabic;
2327 break ;
2328 case wxFONTENCODING_ISO8859_7 :
2329 enc = kCFStringEncodingISOLatinGreek;
2330 break ;
2331 case wxFONTENCODING_ISO8859_8 :
2332 enc = kCFStringEncodingISOLatinHebrew;
2333 break ;
2334 case wxFONTENCODING_ISO8859_9 :
2335 enc = kCFStringEncodingISOLatin5;
2336 break ;
2337 case wxFONTENCODING_ISO8859_10 :
2338 enc = kCFStringEncodingISOLatin6;
2339 break ;
2340 case wxFONTENCODING_ISO8859_11 :
2341 enc = kCFStringEncodingISOLatinThai;
2342 break ;
2343 case wxFONTENCODING_ISO8859_13 :
2344 enc = kCFStringEncodingISOLatin7;
2345 break ;
2346 case wxFONTENCODING_ISO8859_14 :
2347 enc = kCFStringEncodingISOLatin8;
2348 break ;
2349 case wxFONTENCODING_ISO8859_15 :
2350 enc = kCFStringEncodingISOLatin9;
2351 break ;
2352
2353 case wxFONTENCODING_KOI8 :
2354 enc = kCFStringEncodingKOI8_R;
2355 break ;
2356 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2357 enc = kCFStringEncodingDOSRussian;
2358 break ;
2359
2360 // case wxFONTENCODING_BULGARIAN :
2361 // enc = ;
2362 // break ;
2363
2364 case wxFONTENCODING_CP437 :
2365 enc = kCFStringEncodingDOSLatinUS ;
2366 break ;
2367 case wxFONTENCODING_CP850 :
2368 enc = kCFStringEncodingDOSLatin1;
2369 break ;
2370 case wxFONTENCODING_CP852 :
2371 enc = kCFStringEncodingDOSLatin2;
2372 break ;
2373 case wxFONTENCODING_CP855 :
2374 enc = kCFStringEncodingDOSCyrillic;
2375 break ;
2376 case wxFONTENCODING_CP866 :
2377 enc = kCFStringEncodingDOSRussian ;
2378 break ;
2379 case wxFONTENCODING_CP874 :
2380 enc = kCFStringEncodingDOSThai;
2381 break ;
2382 case wxFONTENCODING_CP932 :
2383 enc = kCFStringEncodingDOSJapanese;
2384 break ;
2385 case wxFONTENCODING_CP936 :
2386 enc = kCFStringEncodingDOSChineseSimplif ;
2387 break ;
2388 case wxFONTENCODING_CP949 :
2389 enc = kCFStringEncodingDOSKorean;
2390 break ;
2391 case wxFONTENCODING_CP950 :
2392 enc = kCFStringEncodingDOSChineseTrad;
2393 break ;
2394 case wxFONTENCODING_CP1250 :
2395 enc = kCFStringEncodingWindowsLatin2;
2396 break ;
2397 case wxFONTENCODING_CP1251 :
2398 enc = kCFStringEncodingWindowsCyrillic ;
2399 break ;
2400 case wxFONTENCODING_CP1252 :
2401 enc = kCFStringEncodingWindowsLatin1 ;
2402 break ;
2403 case wxFONTENCODING_CP1253 :
2404 enc = kCFStringEncodingWindowsGreek;
2405 break ;
2406 case wxFONTENCODING_CP1254 :
2407 enc = kCFStringEncodingWindowsLatin5;
2408 break ;
2409 case wxFONTENCODING_CP1255 :
2410 enc = kCFStringEncodingWindowsHebrew ;
2411 break ;
2412 case wxFONTENCODING_CP1256 :
2413 enc = kCFStringEncodingWindowsArabic ;
2414 break ;
2415 case wxFONTENCODING_CP1257 :
2416 enc = kCFStringEncodingWindowsBalticRim;
2417 break ;
2418 // This only really encodes to UTF7 (if that) evidently
2419 // case wxFONTENCODING_UTF7 :
2420 // enc = kCFStringEncodingNonLossyASCII ;
2421 // break ;
2422 case wxFONTENCODING_UTF8 :
2423 enc = kCFStringEncodingUTF8 ;
2424 break ;
2425 case wxFONTENCODING_EUC_JP :
2426 enc = kCFStringEncodingEUC_JP;
2427 break ;
2428 case wxFONTENCODING_UTF16 :
2429 enc = kCFStringEncodingUnicode ;
2430 break ;
2431 case wxFONTENCODING_MACROMAN :
2432 enc = kCFStringEncodingMacRoman ;
2433 break ;
2434 case wxFONTENCODING_MACJAPANESE :
2435 enc = kCFStringEncodingMacJapanese ;
2436 break ;
2437 case wxFONTENCODING_MACCHINESETRAD :
2438 enc = kCFStringEncodingMacChineseTrad ;
2439 break ;
2440 case wxFONTENCODING_MACKOREAN :
2441 enc = kCFStringEncodingMacKorean ;
2442 break ;
2443 case wxFONTENCODING_MACARABIC :
2444 enc = kCFStringEncodingMacArabic ;
2445 break ;
2446 case wxFONTENCODING_MACHEBREW :
2447 enc = kCFStringEncodingMacHebrew ;
2448 break ;
2449 case wxFONTENCODING_MACGREEK :
2450 enc = kCFStringEncodingMacGreek ;
2451 break ;
2452 case wxFONTENCODING_MACCYRILLIC :
2453 enc = kCFStringEncodingMacCyrillic ;
2454 break ;
2455 case wxFONTENCODING_MACDEVANAGARI :
2456 enc = kCFStringEncodingMacDevanagari ;
2457 break ;
2458 case wxFONTENCODING_MACGURMUKHI :
2459 enc = kCFStringEncodingMacGurmukhi ;
2460 break ;
2461 case wxFONTENCODING_MACGUJARATI :
2462 enc = kCFStringEncodingMacGujarati ;
2463 break ;
2464 case wxFONTENCODING_MACORIYA :
2465 enc = kCFStringEncodingMacOriya ;
2466 break ;
2467 case wxFONTENCODING_MACBENGALI :
2468 enc = kCFStringEncodingMacBengali ;
2469 break ;
2470 case wxFONTENCODING_MACTAMIL :
2471 enc = kCFStringEncodingMacTamil ;
2472 break ;
2473 case wxFONTENCODING_MACTELUGU :
2474 enc = kCFStringEncodingMacTelugu ;
2475 break ;
2476 case wxFONTENCODING_MACKANNADA :
2477 enc = kCFStringEncodingMacKannada ;
2478 break ;
2479 case wxFONTENCODING_MACMALAJALAM :
2480 enc = kCFStringEncodingMacMalayalam ;
2481 break ;
2482 case wxFONTENCODING_MACSINHALESE :
2483 enc = kCFStringEncodingMacSinhalese ;
2484 break ;
2485 case wxFONTENCODING_MACBURMESE :
2486 enc = kCFStringEncodingMacBurmese ;
2487 break ;
2488 case wxFONTENCODING_MACKHMER :
2489 enc = kCFStringEncodingMacKhmer ;
2490 break ;
2491 case wxFONTENCODING_MACTHAI :
2492 enc = kCFStringEncodingMacThai ;
2493 break ;
2494 case wxFONTENCODING_MACLAOTIAN :
2495 enc = kCFStringEncodingMacLaotian ;
2496 break ;
2497 case wxFONTENCODING_MACGEORGIAN :
2498 enc = kCFStringEncodingMacGeorgian ;
2499 break ;
2500 case wxFONTENCODING_MACARMENIAN :
2501 enc = kCFStringEncodingMacArmenian ;
2502 break ;
2503 case wxFONTENCODING_MACCHINESESIMP :
2504 enc = kCFStringEncodingMacChineseSimp ;
2505 break ;
2506 case wxFONTENCODING_MACTIBETAN :
2507 enc = kCFStringEncodingMacTibetan ;
2508 break ;
2509 case wxFONTENCODING_MACMONGOLIAN :
2510 enc = kCFStringEncodingMacMongolian ;
2511 break ;
2512 case wxFONTENCODING_MACETHIOPIC :
2513 enc = kCFStringEncodingMacEthiopic ;
2514 break ;
2515 case wxFONTENCODING_MACCENTRALEUR :
2516 enc = kCFStringEncodingMacCentralEurRoman ;
2517 break ;
2518 case wxFONTENCODING_MACVIATNAMESE :
2519 enc = kCFStringEncodingMacVietnamese ;
2520 break ;
2521 case wxFONTENCODING_MACARABICEXT :
2522 enc = kCFStringEncodingMacExtArabic ;
2523 break ;
2524 case wxFONTENCODING_MACSYMBOL :
2525 enc = kCFStringEncodingMacSymbol ;
2526 break ;
2527 case wxFONTENCODING_MACDINGBATS :
2528 enc = kCFStringEncodingMacDingbats ;
2529 break ;
2530 case wxFONTENCODING_MACTURKISH :
2531 enc = kCFStringEncodingMacTurkish ;
2532 break ;
2533 case wxFONTENCODING_MACCROATIAN :
2534 enc = kCFStringEncodingMacCroatian ;
2535 break ;
2536 case wxFONTENCODING_MACICELANDIC :
2537 enc = kCFStringEncodingMacIcelandic ;
2538 break ;
2539 case wxFONTENCODING_MACROMANIAN :
2540 enc = kCFStringEncodingMacRomanian ;
2541 break ;
2542 case wxFONTENCODING_MACCELTIC :
2543 enc = kCFStringEncodingMacCeltic ;
2544 break ;
2545 case wxFONTENCODING_MACGAELIC :
2546 enc = kCFStringEncodingMacGaelic ;
2547 break ;
2548 // case wxFONTENCODING_MACKEYBOARD :
2549 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2550 // break ;
2551
2552 default :
2553 // because gcc is picky
2554 break ;
2555 }
2556
2557 return enc ;
2558 }
2559
2560 class wxMBConv_cocoa : public wxMBConv
2561 {
2562 public:
2563 wxMBConv_cocoa()
2564 {
2565 Init(CFStringGetSystemEncoding()) ;
2566 }
2567
2568 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2569 {
2570 m_encoding = conv.m_encoding;
2571 }
2572
2573 #if wxUSE_FONTMAP
2574 wxMBConv_cocoa(const wxChar* name)
2575 {
2576 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2577 }
2578 #endif
2579
2580 wxMBConv_cocoa(wxFontEncoding encoding)
2581 {
2582 Init( wxCFStringEncFromFontEnc(encoding) );
2583 }
2584
2585 virtual ~wxMBConv_cocoa()
2586 {
2587 }
2588
2589 void Init( CFStringEncoding encoding)
2590 {
2591 m_encoding = encoding ;
2592 }
2593
2594 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2595 {
2596 wxASSERT(szUnConv);
2597
2598 CFStringRef theString = CFStringCreateWithBytes (
2599 NULL, //the allocator
2600 (const UInt8*)szUnConv,
2601 strlen(szUnConv),
2602 m_encoding,
2603 false //no BOM/external representation
2604 );
2605
2606 wxASSERT(theString);
2607
2608 size_t nOutLength = CFStringGetLength(theString);
2609
2610 if (szOut == NULL)
2611 {
2612 CFRelease(theString);
2613 return nOutLength;
2614 }
2615
2616 CFRange theRange = { 0, nOutSize };
2617
2618 #if SIZEOF_WCHAR_T == 4
2619 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2620 #endif
2621
2622 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2623
2624 CFRelease(theString);
2625
2626 szUniCharBuffer[nOutLength] = '\0';
2627
2628 #if SIZEOF_WCHAR_T == 4
2629 wxMBConvUTF16 converter;
2630 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2631 delete [] szUniCharBuffer;
2632 #endif
2633
2634 return nOutLength;
2635 }
2636
2637 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2638 {
2639 wxASSERT(szUnConv);
2640
2641 size_t nRealOutSize;
2642 size_t nBufSize = wxWcslen(szUnConv);
2643 UniChar* szUniBuffer = (UniChar*) szUnConv;
2644
2645 #if SIZEOF_WCHAR_T == 4
2646 wxMBConvUTF16 converter ;
2647 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2648 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2649 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2650 nBufSize /= sizeof(UniChar);
2651 #endif
2652
2653 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2654 NULL, //allocator
2655 szUniBuffer,
2656 nBufSize,
2657 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2658 );
2659
2660 wxASSERT(theString);
2661
2662 //Note that CER puts a BOM when converting to unicode
2663 //so we check and use getchars instead in that case
2664 if (m_encoding == kCFStringEncodingUnicode)
2665 {
2666 if (szOut != NULL)
2667 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2668
2669 nRealOutSize = CFStringGetLength(theString) + 1;
2670 }
2671 else
2672 {
2673 CFStringGetBytes(
2674 theString,
2675 CFRangeMake(0, CFStringGetLength(theString)),
2676 m_encoding,
2677 0, //what to put in characters that can't be converted -
2678 //0 tells CFString to return NULL if it meets such a character
2679 false, //not an external representation
2680 (UInt8*) szOut,
2681 nOutSize,
2682 (CFIndex*) &nRealOutSize
2683 );
2684 }
2685
2686 CFRelease(theString);
2687
2688 #if SIZEOF_WCHAR_T == 4
2689 delete[] szUniBuffer;
2690 #endif
2691
2692 return nRealOutSize - 1;
2693 }
2694
2695 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2696
2697 bool IsOk() const
2698 {
2699 return m_encoding != kCFStringEncodingInvalidId &&
2700 CFStringIsEncodingAvailable(m_encoding);
2701 }
2702
2703 private:
2704 CFStringEncoding m_encoding ;
2705 };
2706
2707 #endif // defined(__WXCOCOA__)
2708
2709 // ============================================================================
2710 // Mac conversion classes
2711 // ============================================================================
2712
2713 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2714
2715 class wxMBConv_mac : public wxMBConv
2716 {
2717 public:
2718 wxMBConv_mac()
2719 {
2720 Init(CFStringGetSystemEncoding()) ;
2721 }
2722
2723 wxMBConv_mac(const wxMBConv_mac& conv)
2724 {
2725 Init(conv.m_char_encoding);
2726 }
2727
2728 #if wxUSE_FONTMAP
2729 wxMBConv_mac(const wxChar* name)
2730 {
2731 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2732 }
2733 #endif
2734
2735 wxMBConv_mac(wxFontEncoding encoding)
2736 {
2737 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2738 }
2739
2740 virtual ~wxMBConv_mac()
2741 {
2742 OSStatus status = noErr ;
2743 if (m_MB2WC_converter)
2744 status = TECDisposeConverter(m_MB2WC_converter);
2745 if (m_WC2MB_converter)
2746 status = TECDisposeConverter(m_WC2MB_converter);
2747 }
2748
2749 void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2750 TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2751 {
2752 m_MB2WC_converter = NULL ;
2753 m_WC2MB_converter = NULL ;
2754 m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
2755 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2756 }
2757
2758 virtual void CreateIfNeeded() const
2759 {
2760 if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
2761 {
2762 OSStatus status = noErr ;
2763 status = TECCreateConverter(&m_MB2WC_converter,
2764 m_char_encoding,
2765 m_unicode_encoding);
2766 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2767 status = TECCreateConverter(&m_WC2MB_converter,
2768 m_unicode_encoding,
2769 m_char_encoding);
2770 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2771 }
2772 }
2773
2774 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2775 {
2776 CreateIfNeeded() ;
2777 OSStatus status = noErr ;
2778 ByteCount byteOutLen ;
2779 ByteCount byteInLen = strlen(psz) + 1;
2780 wchar_t *tbuf = NULL ;
2781 UniChar* ubuf = NULL ;
2782 size_t res = 0 ;
2783
2784 if (buf == NULL)
2785 {
2786 // Apple specs say at least 32
2787 n = wxMax( 32, byteInLen ) ;
2788 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2789 }
2790
2791 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2792
2793 #if SIZEOF_WCHAR_T == 4
2794 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2795 #else
2796 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2797 #endif
2798
2799 status = TECConvertText(
2800 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2801 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2802
2803 #if SIZEOF_WCHAR_T == 4
2804 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2805 // is not properly terminated we get random characters at the end
2806 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2807 wxMBConvUTF16 converter ;
2808 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2809 free( ubuf ) ;
2810 #else
2811 res = byteOutLen / sizeof( UniChar ) ;
2812 #endif
2813
2814 if ( buf == NULL )
2815 free(tbuf) ;
2816
2817 if ( buf && res < n)
2818 buf[res] = 0;
2819
2820 return res ;
2821 }
2822
2823 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2824 {
2825 CreateIfNeeded() ;
2826 OSStatus status = noErr ;
2827 ByteCount byteOutLen ;
2828 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2829
2830 char *tbuf = NULL ;
2831
2832 if (buf == NULL)
2833 {
2834 // Apple specs say at least 32
2835 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2836 tbuf = (char*) malloc( n ) ;
2837 }
2838
2839 ByteCount byteBufferLen = n ;
2840 UniChar* ubuf = NULL ;
2841
2842 #if SIZEOF_WCHAR_T == 4
2843 wxMBConvUTF16 converter ;
2844 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2845 byteInLen = unicharlen ;
2846 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2847 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2848 #else
2849 ubuf = (UniChar*) psz ;
2850 #endif
2851
2852 status = TECConvertText(
2853 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2854 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2855
2856 #if SIZEOF_WCHAR_T == 4
2857 free( ubuf ) ;
2858 #endif
2859
2860 if ( buf == NULL )
2861 free(tbuf) ;
2862
2863 size_t res = byteOutLen ;
2864 if ( buf && res < n)
2865 {
2866 buf[res] = 0;
2867
2868 //we need to double-trip to verify it didn't insert any ? in place
2869 //of bogus characters
2870 wxWCharBuffer wcBuf(n);
2871 size_t pszlen = wxWcslen(psz);
2872 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2873 wxWcslen(wcBuf) != pszlen ||
2874 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2875 {
2876 // we didn't obtain the same thing we started from, hence
2877 // the conversion was lossy and we consider that it failed
2878 return wxCONV_FAILED;
2879 }
2880 }
2881
2882 return res ;
2883 }
2884
2885 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2886
2887 bool IsOk() const
2888 {
2889 CreateIfNeeded() ;
2890 return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
2891 }
2892
2893 protected :
2894 mutable TECObjectRef m_MB2WC_converter;
2895 mutable TECObjectRef m_WC2MB_converter;
2896
2897 TextEncodingBase m_char_encoding;
2898 TextEncodingBase m_unicode_encoding;
2899 };
2900
// Converter whose multibyte side is UTF-8 text in canonically decomposed form
// (the "D" in the class name).
2902
2903 class wxMBConv_macUTF8D : public wxMBConv_mac
2904 {
2905 public :
2906 wxMBConv_macUTF8D()
2907 {
2908 Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
2909 m_uni = NULL;
2910 m_uniBack = NULL ;
2911 }
2912
2913 virtual ~wxMBConv_macUTF8D()
2914 {
2915 if (m_uni!=NULL)
2916 DisposeUnicodeToTextInfo(&m_uni);
2917 if (m_uniBack!=NULL)
2918 DisposeUnicodeToTextInfo(&m_uniBack);
2919 }
2920
2921 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2922 {
2923 CreateIfNeeded() ;
2924 OSStatus status = noErr ;
2925 ByteCount byteOutLen ;
2926 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2927
2928 char *tbuf = NULL ;
2929
2930 if (buf == NULL)
2931 {
2932 // Apple specs say at least 32
2933 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2934 tbuf = (char*) malloc( n ) ;
2935 }
2936
2937 ByteCount byteBufferLen = n ;
2938 UniChar* ubuf = NULL ;
2939
2940 #if SIZEOF_WCHAR_T == 4
2941 wxMBConvUTF16 converter ;
2942 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2943 byteInLen = unicharlen ;
2944 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2945 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2946 #else
2947 ubuf = (UniChar*) psz ;
2948 #endif
2949
2950 // ubuf is a non-decomposed UniChar buffer
2951
2952 ByteCount dcubuflen = byteInLen * 2 + 2 ;
2953 ByteCount dcubufread , dcubufwritten ;
2954 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2955
2956 ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
2957 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , dcubuf ) ;
2958
2959 // we now convert that decomposed buffer into UTF8
2960
2961 status = TECConvertText(
2962 m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
2963 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2964
2965 free( dcubuf );
2966
2967 #if SIZEOF_WCHAR_T == 4
2968 free( ubuf ) ;
2969 #endif
2970
2971 if ( buf == NULL )
2972 free(tbuf) ;
2973
2974 size_t res = byteOutLen ;
2975 if ( buf && res < n)
2976 {
2977 buf[res] = 0;
2978 // don't test for round-trip fidelity yet, we cannot guarantee it yet
2979 }
2980
2981 return res ;
2982 }
2983
2984 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2985 {
2986 CreateIfNeeded() ;
2987 OSStatus status = noErr ;
2988 ByteCount byteOutLen ;
2989 ByteCount byteInLen = strlen(psz) + 1;
2990 wchar_t *tbuf = NULL ;
2991 UniChar* ubuf = NULL ;
2992 size_t res = 0 ;
2993
2994 if (buf == NULL)
2995 {
2996 // Apple specs say at least 32
2997 n = wxMax( 32, byteInLen ) ;
2998 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2999 }
3000
3001 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
3002
3003 #if SIZEOF_WCHAR_T == 4
3004 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
3005 #else
3006 ubuf = (UniChar*) (buf ? buf : tbuf) ;
3007 #endif
3008
3009 ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
3010 ByteCount dcubufread , dcubufwritten ;
3011 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
3012
3013 status = TECConvertText(
3014 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
3015 (TextPtr) dcubuf, dcubuflen, &byteOutLen);
3016 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
3017 // is not properly terminated we get random characters at the end
3018 dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3019
3020 // now from the decomposed UniChar to properly composed uniChar
3021 ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
3022 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , ubuf ) ;
3023
3024 free( dcubuf );
3025 byteOutLen = dcubufwritten ;
3026 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3027
3028
3029 #if SIZEOF_WCHAR_T == 4
3030 wxMBConvUTF16 converter ;
3031 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
3032 free( ubuf ) ;
3033 #else
3034 res = byteOutLen / sizeof( UniChar ) ;
3035 #endif
3036
3037 if ( buf == NULL )
3038 free(tbuf) ;
3039
3040 if ( buf && res < n)
3041 buf[res] = 0;
3042
3043 return res ;
3044 }
3045
3046 virtual void CreateIfNeeded() const
3047 {
3048 wxMBConv_mac::CreateIfNeeded() ;
3049 if ( m_uni == NULL )
3050 {
3051 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3052 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3053 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3054 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
3055 m_map.mappingVersion = kUnicodeUseLatestMapping;
3056
3057 OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
3058 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3059
3060 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3061 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3062 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3063 kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
3064 m_map.mappingVersion = kUnicodeUseLatestMapping;
3065 err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
3066 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3067 }
3068 }
3069 protected :
3070 mutable UnicodeToTextInfo m_uni;
3071 mutable UnicodeToTextInfo m_uniBack;
3072 mutable UnicodeMapping m_map;
3073 };
3074 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3075
3076 // ============================================================================
3077 // wxEncodingConverter based conversion classes
3078 // ============================================================================
3079
3080 #if wxUSE_FONTMAP
3081
3082 class wxMBConv_wxwin : public wxMBConv
3083 {
3084 private:
3085 void Init()
3086 {
3087 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
3088 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
3089 }
3090
3091 public:
3092 // temporarily just use wxEncodingConverter stuff,
3093 // so that it works while a better implementation is built
3094 wxMBConv_wxwin(const wxChar* name)
3095 {
3096 if (name)
3097 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3098 else
3099 m_enc = wxFONTENCODING_SYSTEM;
3100
3101 Init();
3102 }
3103
3104 wxMBConv_wxwin(wxFontEncoding enc)
3105 {
3106 m_enc = enc;
3107
3108 Init();
3109 }
3110
3111 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
3112 {
3113 size_t inbuf = strlen(psz);
3114 if (buf)
3115 {
3116 if (!m2w.Convert(psz, buf))
3117 return wxCONV_FAILED;
3118 }
3119 return inbuf;
3120 }
3121
3122 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
3123 {
3124 const size_t inbuf = wxWcslen(psz);
3125 if (buf)
3126 {
3127 if (!w2m.Convert(psz, buf))
3128 return wxCONV_FAILED;
3129 }
3130
3131 return inbuf;
3132 }
3133
3134 virtual size_t GetMBNulLen() const
3135 {
3136 switch ( m_enc )
3137 {
3138 case wxFONTENCODING_UTF16BE:
3139 case wxFONTENCODING_UTF16LE:
3140 return 2;
3141
3142 case wxFONTENCODING_UTF32BE:
3143 case wxFONTENCODING_UTF32LE:
3144 return 4;
3145
3146 default:
3147 return 1;
3148 }
3149 }
3150
3151 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
3152
3153 bool IsOk() const { return m_ok; }
3154
3155 public:
3156 wxFontEncoding m_enc;
3157 wxEncodingConverter m2w, w2m;
3158
3159 private:
3160 // were we initialized successfully?
3161 bool m_ok;
3162
3163 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
3164 };
3165
3166 // make the constructors available for unit testing
3167 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
3168 {
3169 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
3170 if ( !result->IsOk() )
3171 {
3172 delete result;
3173 return 0;
3174 }
3175
3176 return result;
3177 }
3178
3179 #endif // wxUSE_FONTMAP
3180
3181 // ============================================================================
3182 // wxCSConv implementation
3183 // ============================================================================
3184
3185 void wxCSConv::Init()
3186 {
3187 m_name = NULL;
3188 m_convReal = NULL;
3189 m_deferred = true;
3190 }
3191
3192 wxCSConv::wxCSConv(const wxChar *charset)
3193 {
3194 Init();
3195
3196 if ( charset )
3197 {
3198 SetName(charset);
3199 }
3200
3201 #if wxUSE_FONTMAP
3202 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3203 #else
3204 m_encoding = wxFONTENCODING_SYSTEM;
3205 #endif
3206 }
3207
3208 wxCSConv::wxCSConv(wxFontEncoding encoding)
3209 {
3210 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3211 {
3212 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3213
3214 encoding = wxFONTENCODING_SYSTEM;
3215 }
3216
3217 Init();
3218
3219 m_encoding = encoding;
3220 }
3221
3222 wxCSConv::~wxCSConv()
3223 {
3224 Clear();
3225 }
3226
3227 wxCSConv::wxCSConv(const wxCSConv& conv)
3228 : wxMBConv()
3229 {
3230 Init();
3231
3232 SetName(conv.m_name);
3233 m_encoding = conv.m_encoding;
3234 }
3235
3236 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3237 {
3238 Clear();
3239
3240 SetName(conv.m_name);
3241 m_encoding = conv.m_encoding;
3242
3243 return *this;
3244 }
3245
3246 void wxCSConv::Clear()
3247 {
3248 free(m_name);
3249 delete m_convReal;
3250
3251 m_name = NULL;
3252 m_convReal = NULL;
3253 }
3254
3255 void wxCSConv::SetName(const wxChar *charset)
3256 {
3257 if (charset)
3258 {
3259 m_name = wxStrdup(charset);
3260 m_deferred = true;
3261 }
3262 }
3263
3264 #if wxUSE_FONTMAP
3265
3266 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3267 wxEncodingNameCache );
3268
3269 static wxEncodingNameCache gs_nameCache;
3270 #endif
3271
3272 wxMBConv *wxCSConv::DoCreate() const
3273 {
3274 #if wxUSE_FONTMAP
3275 wxLogTrace(TRACE_STRCONV,
3276 wxT("creating conversion for %s"),
3277 (m_name ? m_name
3278 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3279 #endif // wxUSE_FONTMAP
3280
3281 // check for the special case of ASCII or ISO8859-1 charset: as we have
3282 // special knowledge of it anyhow, we don't need to create a special
3283 // conversion object
3284 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3285 m_encoding == wxFONTENCODING_DEFAULT )
3286 {
3287 // don't convert at all
3288 return NULL;
3289 }
3290
3291 // we trust OS to do conversion better than we can so try external
3292 // conversion methods first
3293 //
3294 // the full order is:
3295 // 1. OS conversion (iconv() under Unix or Win32 API)
3296 // 2. hard coded conversions for UTF
3297 // 3. wxEncodingConverter as fall back
3298
3299 // step (1)
3300 #ifdef HAVE_ICONV
3301 #if !wxUSE_FONTMAP
3302 if ( m_name )
3303 #endif // !wxUSE_FONTMAP
3304 {
3305 wxString name(m_name);
3306 #if wxUSE_FONTMAP
3307 wxFontEncoding encoding(m_encoding);
3308 #endif
3309
3310 if ( !name.empty() )
3311 {
3312 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3313 if ( conv->IsOk() )
3314 return conv;
3315
3316 delete conv;
3317
3318 #if wxUSE_FONTMAP
3319 encoding =
3320 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3321 #endif // wxUSE_FONTMAP
3322 }
3323 #if wxUSE_FONTMAP
3324 {
3325 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3326 if ( it != gs_nameCache.end() )
3327 {
3328 if ( it->second.empty() )
3329 return NULL;
3330
3331 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3332 if ( conv->IsOk() )
3333 return conv;
3334
3335 delete conv;
3336 }
3337
3338 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3339 // CS : in case this does not return valid names (eg for MacRoman) encoding
3340 // got a 'failure' entry in the cache all the same, although it just has to
3341 // be created using a different method, so only store failed iconv creation
3342 // attempts (or perhaps we shoulnd't do this at all ?)
3343 if ( names[0] != NULL )
3344 {
3345 for ( ; *names; ++names )
3346 {
3347 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3348 if ( conv->IsOk() )
3349 {
3350 gs_nameCache[encoding] = *names;
3351 return conv;
3352 }
3353
3354 delete conv;
3355 }
3356
3357 gs_nameCache[encoding] = _T(""); // cache the failure
3358 }
3359 }
3360 #endif // wxUSE_FONTMAP
3361 }
3362 #endif // HAVE_ICONV
3363
3364 #ifdef wxHAVE_WIN32_MB2WC
3365 {
3366 #if wxUSE_FONTMAP
3367 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3368 : new wxMBConv_win32(m_encoding);
3369 if ( conv->IsOk() )
3370 return conv;
3371
3372 delete conv;
3373 #else
3374 return NULL;
3375 #endif
3376 }
3377 #endif // wxHAVE_WIN32_MB2WC
3378
3379 #if defined(__WXMAC__)
3380 {
3381 // leave UTF16 and UTF32 to the built-ins of wx
3382 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3383 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3384 {
3385 #if wxUSE_FONTMAP
3386 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3387 : new wxMBConv_mac(m_encoding);
3388 #else
3389 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3390 #endif
3391 if ( conv->IsOk() )
3392 return conv;
3393
3394 delete conv;
3395 }
3396 }
3397 #endif
3398
3399 #if defined(__WXCOCOA__)
3400 {
3401 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3402 {
3403 #if wxUSE_FONTMAP
3404 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3405 : new wxMBConv_cocoa(m_encoding);
3406 #else
3407 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3408 #endif
3409
3410 if ( conv->IsOk() )
3411 return conv;
3412
3413 delete conv;
3414 }
3415 }
3416 #endif
3417 // step (2)
3418 wxFontEncoding enc = m_encoding;
3419 #if wxUSE_FONTMAP
3420 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3421 {
3422 // use "false" to suppress interactive dialogs -- we can be called from
3423 // anywhere and popping up a dialog from here is the last thing we want to
3424 // do
3425 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3426 }
3427 #endif // wxUSE_FONTMAP
3428
3429 switch ( enc )
3430 {
3431 case wxFONTENCODING_UTF7:
3432 return new wxMBConvUTF7;
3433
3434 case wxFONTENCODING_UTF8:
3435 return new wxMBConvUTF8;
3436
3437 case wxFONTENCODING_UTF16BE:
3438 return new wxMBConvUTF16BE;
3439
3440 case wxFONTENCODING_UTF16LE:
3441 return new wxMBConvUTF16LE;
3442
3443 case wxFONTENCODING_UTF32BE:
3444 return new wxMBConvUTF32BE;
3445
3446 case wxFONTENCODING_UTF32LE:
3447 return new wxMBConvUTF32LE;
3448
3449 default:
3450 // nothing to do but put here to suppress gcc warnings
3451 break;
3452 }
3453
3454 // step (3)
3455 #if wxUSE_FONTMAP
3456 {
3457 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3458 : new wxMBConv_wxwin(m_encoding);
3459 if ( conv->IsOk() )
3460 return conv;
3461
3462 delete conv;
3463 }
3464 #endif // wxUSE_FONTMAP
3465
3466 // NB: This is a hack to prevent deadlock. What could otherwise happen
3467 // in Unicode build: wxConvLocal creation ends up being here
3468 // because of some failure and logs the error. But wxLog will try to
3469 // attach a timestamp, for which it will need wxConvLocal (to convert
3470 // time to char* and then wchar_t*), but that fails, tries to log the
3471 // error, but wxLog has an (already locked) critical section that
3472 // guards the static buffer.
3473 static bool alreadyLoggingError = false;
3474 if (!alreadyLoggingError)
3475 {
3476 alreadyLoggingError = true;
3477 wxLogError(_("Cannot convert from the charset '%s'!"),
3478 m_name ? m_name
3479 :
3480 #if wxUSE_FONTMAP
3481 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3482 #else // !wxUSE_FONTMAP
3483 wxString::Format(_("encoding %i"), m_encoding).c_str()
3484 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3485 );
3486
3487 alreadyLoggingError = false;
3488 }
3489
3490 return NULL;
3491 }
3492
3493 void wxCSConv::CreateConvIfNeeded() const
3494 {
3495 if ( m_deferred )
3496 {
3497 wxCSConv *self = (wxCSConv *)this; // const_cast
3498
3499 // if we don't have neither the name nor the encoding, use the default
3500 // encoding for this system
3501 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3502 {
3503 #if wxUSE_INTL
3504 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3505 #else
3506 // fallback to some reasonable default:
3507 self->m_encoding = wxFONTENCODING_ISO8859_1;
3508 #endif // wxUSE_INTL
3509 }
3510
3511 self->m_convReal = DoCreate();
3512 self->m_deferred = false;
3513 }
3514 }
3515
3516 bool wxCSConv::IsOk() const
3517 {
3518 CreateConvIfNeeded();
3519
3520 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3521 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3522 return true; // always ok as we do it ourselves
3523
3524 // m_convReal->IsOk() is called at its own creation, so we know it must
3525 // be ok if m_convReal is non-NULL
3526 return m_convReal != NULL;
3527 }
3528
3529 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3530 const char *src, size_t srcLen) const
3531 {
3532 CreateConvIfNeeded();
3533
3534 if (m_convReal)
3535 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3536
3537 // latin-1 (direct)
3538 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3539 }
3540
3541 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3542 const wchar_t *src, size_t srcLen) const
3543 {
3544 CreateConvIfNeeded();
3545
3546 if (m_convReal)
3547 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3548
3549 // latin-1 (direct)
3550 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3551 }
3552
3553 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3554 {
3555 CreateConvIfNeeded();
3556
3557 if (m_convReal)
3558 return m_convReal->MB2WC(buf, psz, n);
3559
3560 // latin-1 (direct)
3561 size_t len = strlen(psz);
3562
3563 if (buf)
3564 {
3565 for (size_t c = 0; c <= len; c++)
3566 buf[c] = (unsigned char)(psz[c]);
3567 }
3568
3569 return len;
3570 }
3571
3572 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3573 {
3574 CreateConvIfNeeded();
3575
3576 if (m_convReal)
3577 return m_convReal->WC2MB(buf, psz, n);
3578
3579 // latin-1 (direct)
3580 const size_t len = wxWcslen(psz);
3581 if (buf)
3582 {
3583 for (size_t c = 0; c <= len; c++)
3584 {
3585 if (psz[c] > 0xFF)
3586 return wxCONV_FAILED;
3587
3588 buf[c] = (char)psz[c];
3589 }
3590 }
3591 else
3592 {
3593 for (size_t c = 0; c <= len; c++)
3594 {
3595 if (psz[c] > 0xFF)
3596 return wxCONV_FAILED;
3597 }
3598 }
3599
3600 return len;
3601 }
3602
3603 size_t wxCSConv::GetMBNulLen() const
3604 {
3605 CreateConvIfNeeded();
3606
3607 if ( m_convReal )
3608 {
3609 return m_convReal->GetMBNulLen();
3610 }
3611
3612 return 1;
3613 }
3614
3615 // ----------------------------------------------------------------------------
3616 // globals
3617 // ----------------------------------------------------------------------------
3618
3619 #ifdef __WINDOWS__
3620 static wxMBConv_win32 wxConvLibcObj;
3621 #elif defined(__WXMAC__) && !defined(__MACH__)
3622 static wxMBConv_mac wxConvLibcObj ;
3623 #else
3624 static wxMBConvLibc wxConvLibcObj;
3625 #endif
3626
3627 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3628 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3629 static wxMBConvUTF7 wxConvUTF7Obj;
3630 static wxMBConvUTF8 wxConvUTF8Obj;
3631 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3632 static wxMBConv_macUTF8D wxConvMacUTF8DObj;
3633 #endif
3634 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3635 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3636 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3637 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3638 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3639 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3640 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
3641 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3642 #ifdef __WXOSX__
3643 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3644 wxConvMacUTF8DObj;
3645 #else
3646 wxConvUTF8Obj;
3647 #endif
3648 #else // !__WXOSX__
3649 wxConvLibcObj;
3650 #endif // __WXOSX__/!__WXOSX__
3651
3652 #if wxUSE_UNICODE
3653
3654 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3655 {
3656 if ( !s )
3657 return wxWCharBuffer();
3658
3659 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3660 if ( !wbuf )
3661 wbuf = wxConvUTF8.cMB2WX(s);
3662 if ( !wbuf )
3663 wbuf = wxConvISO8859_1.cMB2WX(s);
3664
3665 return wbuf;
3666 }
3667
3668 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3669 {
3670 if ( !ws )
3671 return wxCharBuffer();
3672
3673 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3674 if ( !buf )
3675 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3676
3677 return buf;
3678 }
3679
3680 #endif // wxUSE_UNICODE
3681
3682 #else // !wxUSE_WCHAR_T
3683
3684 // stand-ins in absence of wchar_t
3685 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3686 wxConvISO8859_1,
3687 wxConvLocal,
3688 wxConvUTF8;
3689
3690 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T