]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
properly NUL-terminate the output in wxMBConvUTF16swap::WC2MB()
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
25
26 #ifdef __BORLANDC__
27 #pragma hdrstop
28 #endif
29
30 #ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33 #endif // WX_PRECOMP
34
35 #include "wx/strconv.h"
36
37 #if wxUSE_WCHAR_T
38
39 #ifdef __WINDOWS__
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #endif
43
44 #ifndef __WXWINCE__
45 #include <errno.h>
46 #endif
47
48 #include <ctype.h>
49 #include <string.h>
50 #include <stdlib.h>
51
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
55
56 #ifdef __SALFORDC__
57 #include <clib.h>
58 #endif
59
60 #ifdef HAVE_ICONV
61 #include <iconv.h>
62 #include "wx/thread.h"
63 #endif
64
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
67 #include "wx/utils.h"
68
69 #ifdef __WXMAC__
70 #ifndef __DARWIN__
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
74 #endif
75
76 #include "wx/mac/private.h" // includes mac headers
77 #endif
78
79 #define TRACE_STRCONV _T("strconv")
80
81 #if SIZEOF_WCHAR_T == 2
82 #define WC_UTF16
83 #endif
84
85 // ============================================================================
86 // implementation
87 // ============================================================================
88
89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
// Returns true if at least one of the n bytes starting at p is not NUL and
// false if all of them are NUL (in particular, false when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
97
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4
100 // ----------------------------------------------------------------------------
101
102
103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
104 {
105 if (input<=0xffff)
106 {
107 if (output)
108 *output = (wxUint16) input;
109 return 1;
110 }
111 else if (input>=0x110000)
112 {
113 return (size_t)-1;
114 }
115 else
116 {
117 if (output)
118 {
119 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
120 *output = (wxUint16) ((input&0x3ff)+0xdc00);
121 }
122 return 2;
123 }
124 }
125
126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
127 {
128 if ((*input<0xd800) || (*input>0xdfff))
129 {
130 output = *input;
131 return 1;
132 }
133 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
134 {
135 output = *input;
136 return (size_t)-1;
137 }
138 else
139 {
140 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
141 return 2;
142 }
143 }
144
145
146 // ----------------------------------------------------------------------------
147 // wxMBConv
148 // ----------------------------------------------------------------------------
149
150 size_t
151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
152 const char *src, size_t srcLen) const
153 {
154 // although new conversion classes are supposed to implement this function
155 // directly, the existins ones only implement the old MB2WC() and so, to
156 // avoid to have to rewrite all conversion classes at once, we provide a
157 // default (but not efficient) implementation of this one in terms of the
158 // old function by copying the input to ensure that it's NUL-terminated and
159 // then using MB2WC() to convert it
160
161 // the number of chars [which would be] written to dst [if it were not NULL]
162 size_t dstWritten = 0;
163
164 // the number of NULs terminating this string
165 size_t nulLen wxDUMMY_INITIALIZE(0);
166
167 // if we were not given the input size we just have to assume that the
168 // string is properly terminated as we have no way of knowing how long it
169 // is anyhow, but if we do have the size check whether there are enough
170 // NULs at the end
171 wxCharBuffer bufTmp;
172 const char *srcEnd;
173 if ( srcLen != (size_t)-1 )
174 {
175 // we need to know how to find the end of this string
176 nulLen = GetMBNulLen();
177 if ( nulLen == wxCONV_FAILED )
178 return wxCONV_FAILED;
179
180 // if there are enough NULs we can avoid the copy
181 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
182 {
183 // make a copy in order to properly NUL-terminate the string
184 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
185 char * const p = bufTmp.data();
186 memcpy(p, src, srcLen);
187 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
188 *s = '\0';
189
190 src = bufTmp;
191 }
192
193 srcEnd = src + srcLen;
194 }
195 else // quit after the first loop iteration
196 {
197 srcEnd = NULL;
198 }
199
200 for ( ;; )
201 {
202 // try to convert the current chunk
203 size_t lenChunk = MB2WC(NULL, src, 0);
204 if ( lenChunk == 0 )
205 {
206 // nothing left in the input string, conversion succeeded; but
207 // still account for the trailing NULL
208 dstWritten++;
209 break;
210 }
211
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
214
215 lenChunk++; // for trailing NUL
216
217 dstWritten += lenChunk;
218
219 if ( dst )
220 {
221 if ( dstWritten > dstLen )
222 return wxCONV_FAILED;
223
224 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
225 return wxCONV_FAILED;
226
227 dst += lenChunk;
228 }
229
230 if ( !srcEnd )
231 {
232 // we convert the entire string in this cas, as we suppose that the
233 // string is NUL-terminated and so srcEnd is not used at all
234 break;
235 }
236
237 // advance the input pointer past the end of this chunk
238 while ( NotAllNULs(src, nulLen) )
239 {
240 // notice that we must skip over multiple bytes here as we suppose
241 // that if NUL takes 2 or 4 bytes, then all the other characters do
242 // too and so if advanced by a single byte we might erroneously
243 // detect sequences of NUL bytes in the middle of the input
244 src += nulLen;
245 }
246
247 src += nulLen; // skipping over its terminator as well
248
249 // note that ">=" (and not just "==") is needed here as the terminator
250 // we skipped just above could be inside or just after the buffer
251 // delimited by inEnd
252 if ( src >= srcEnd )
253 break;
254 }
255
256 return dstWritten;
257 }
258
259 size_t
260 wxMBConv::FromWChar(char *dst, size_t dstLen,
261 const wchar_t *src, size_t srcLen) const
262 {
263 // the number of chars [which would be] written to dst [if it were not NULL]
264 size_t dstWritten = 0;
265
266 // make a copy of the input string unless it is already properly
267 // NUL-terminated
268 //
269 // if we don't know its length we have no choice but to assume that it is,
270 // indeed, properly terminated
271 wxWCharBuffer bufTmp;
272 if ( srcLen == (size_t)-1 )
273 {
274 srcLen = wxWcslen(src) + 1;
275 }
276 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
277 {
278 // make a copy in order to properly NUL-terminate the string
279 bufTmp = wxWCharBuffer(srcLen);
280 memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
281 src = bufTmp;
282 }
283
284 const size_t lenNul = GetMBNulLen();
285 for ( const wchar_t * const srcEnd = src + srcLen;
286 src < srcEnd;
287 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
288 {
289 // try to convert the current chunk
290 size_t lenChunk = WC2MB(NULL, src, 0);
291
292 if ( lenChunk == wxCONV_FAILED )
293 return wxCONV_FAILED;
294
295 lenChunk += lenNul;
296 dstWritten += lenChunk;
297
298 if ( dst )
299 {
300 if ( dstWritten > dstLen )
301 return wxCONV_FAILED;
302
303 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
304 return wxCONV_FAILED;
305
306 dst += lenChunk;
307 }
308 }
309
310 return dstWritten;
311 }
312
313 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
314 {
315 size_t rc = ToWChar(out, outLen, in);
316 if ( rc != wxCONV_FAILED )
317 {
318 // ToWChar() returns the buffer length, i.e. including the trailing
319 // NUL, while this method doesn't take it into account
320 rc--;
321 }
322
323 return rc;
324 }
325
326 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
327 {
328 size_t rc = FromWChar(out, outLen, in);
329 if ( rc != wxCONV_FAILED )
330 {
331 rc -= GetMBNulLen();
332 }
333
334 return rc;
335 }
336
337 wxMBConv::~wxMBConv()
338 {
339 // nothing to do here (necessary for Darwin linking probably)
340 }
341
342 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
343 {
344 if ( psz )
345 {
346 // calculate the length of the buffer needed first
347 const size_t nLen = MB2WC(NULL, psz, 0);
348 if ( nLen != wxCONV_FAILED )
349 {
350 // now do the actual conversion
351 wxWCharBuffer buf(nLen /* +1 added implicitly */);
352
353 // +1 for the trailing NULL
354 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
355 return buf;
356 }
357 }
358
359 return wxWCharBuffer();
360 }
361
362 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
363 {
364 if ( pwz )
365 {
366 const size_t nLen = WC2MB(NULL, pwz, 0);
367 if ( nLen != wxCONV_FAILED )
368 {
369 // extra space for trailing NUL(s)
370 static const size_t extraLen = GetMaxMBNulLen();
371
372 wxCharBuffer buf(nLen + extraLen - 1);
373 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
374 return buf;
375 }
376 }
377
378 return wxCharBuffer();
379 }
380
381 const wxWCharBuffer
382 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
383 {
384 const size_t dstLen = ToWChar(NULL, 0, in, inLen);
385 if ( dstLen != wxCONV_FAILED )
386 {
387 wxWCharBuffer wbuf(dstLen - 1);
388 if ( ToWChar(wbuf.data(), dstLen, in, inLen) )
389 {
390 if ( outLen )
391 *outLen = dstLen - 1;
392 return wbuf;
393 }
394 }
395
396 if ( outLen )
397 *outLen = 0;
398
399 return wxWCharBuffer();
400 }
401
402 const wxCharBuffer
403 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
404 {
405 const size_t dstLen = FromWChar(NULL, 0, in, inLen);
406 if ( dstLen != wxCONV_FAILED )
407 {
408 wxCharBuffer buf(dstLen - 1);
409 if ( FromWChar(buf.data(), dstLen, in, inLen) )
410 {
411 if ( outLen )
412 *outLen = dstLen - 1;
413 return buf;
414 }
415 }
416
417 if ( outLen )
418 *outLen = 0;
419
420 return wxCharBuffer();
421 }
422
423 // ----------------------------------------------------------------------------
424 // wxMBConvLibc
425 // ----------------------------------------------------------------------------
426
427 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
428 {
429 return wxMB2WC(buf, psz, n);
430 }
431
432 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
433 {
434 return wxWC2MB(buf, psz, n);
435 }
436
437 // ----------------------------------------------------------------------------
438 // wxConvBrokenFileNames
439 // ----------------------------------------------------------------------------
440
441 #ifdef __UNIX__
442
443 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
444 {
445 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
446 || wxStricmp(charset, _T("UTF8")) == 0 )
447 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
448 else
449 m_conv = new wxCSConv(charset);
450 }
451
452 #endif // __UNIX__
453
454 // ----------------------------------------------------------------------------
455 // UTF-7
456 // ----------------------------------------------------------------------------
457
458 // Implementation (C) 2004 Fredrik Roubert
459
460 //
461 // BASE64 decoding table
462 //
// Maps a byte to its 6 bit BASE64 value ('A'-'Z' -> 0x00-0x19, 'a'-'z' ->
// 0x1a-0x33, '0'-'9' -> 0x34-0x3d, '+' -> 0x3e, '/' -> 0x3f); all the other
// bytes map to 0xff meaning "not a BASE64 character".
//
// The table has 256 entries, 16 per row below.
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' - 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' - 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' - 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' - 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: nothing valid here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
498
499 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
500 {
501 size_t len = 0;
502
503 while ( *psz && (!buf || (len < n)) )
504 {
505 unsigned char cc = *psz++;
506 if (cc != '+')
507 {
508 // plain ASCII char
509 if (buf)
510 *buf++ = cc;
511 len++;
512 }
513 else if (*psz == '-')
514 {
515 // encoded plus sign
516 if (buf)
517 *buf++ = cc;
518 len++;
519 psz++;
520 }
521 else // start of BASE64 encoded string
522 {
523 bool lsb, ok;
524 unsigned int d, l;
525 for ( ok = lsb = false, d = 0, l = 0;
526 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
527 psz++ )
528 {
529 d <<= 6;
530 d += cc;
531 for (l += 6; l >= 8; lsb = !lsb)
532 {
533 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
534 if (lsb)
535 {
536 if (buf)
537 *buf++ |= c;
538 len ++;
539 }
540 else
541 {
542 if (buf)
543 *buf = (wchar_t)(c << 8);
544 }
545
546 ok = true;
547 }
548 }
549
550 if ( !ok )
551 {
552 // in valid UTF7 we should have valid characters after '+'
553 return (size_t)-1;
554 }
555
556 if (*psz == '-')
557 psz++;
558 }
559 }
560
561 if ( buf && (len < n) )
562 *buf = '\0';
563
564 return len;
565 }
566
567 //
568 // BASE64 encoding table
569 //
// Maps a 6 bit value (0..63) to the corresponding BASE64 character: upper
// case letters first, then lower case letters, digits, '+' and '/'.
static const unsigned char utf7enb64[] =
{
    // 0 - 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 - 51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 - 61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62 - 63
    '+', '/'
};
581
582 //
583 // UTF-7 encoding table
584 //
585 // 0 - Set D (directly encoded characters)
586 // 1 - Set O (optional direct characters)
587 // 2 - whitespace characters (optional)
588 // 3 - special characters
589 //
// Class of each of the 128 ASCII characters for the UTF-7 encoder (8 entries
// per row below), using the same codes as described above:
//      0 - Set D (directly encoded characters)
//      1 - Set O (optional direct characters)
//      2 - whitespace characters (optional)
//      3 - special characters
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x00 - 0x07
    3, 2, 2, 3, 3, 2, 3, 3,     // 0x08 - 0x0f
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x10 - 0x17
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x18 - 0x1f
    2, 1, 1, 1, 1, 1, 1, 0,     // 0x20 - 0x27
    0, 0, 1, 3, 0, 0, 0, 3,     // 0x28 - 0x2f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x30 - 0x37
    0, 0, 0, 1, 1, 1, 1, 0,     // 0x38 - 0x3f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x40 - 0x47
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x48 - 0x4f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x50 - 0x57
    0, 0, 0, 1, 3, 1, 1, 1,     // 0x58 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x60 - 0x67
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x68 - 0x6f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x70 - 0x77
    0, 0, 0, 1, 1, 1, 3, 3      // 0x78 - 0x7f
};
601
602 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
603 {
604 size_t len = 0;
605
606 while (*psz && ((!buf) || (len < n)))
607 {
608 wchar_t cc = *psz++;
609 if (cc < 0x80 && utf7encode[cc] < 1)
610 {
611 // plain ASCII char
612 if (buf)
613 *buf++ = (char)cc;
614 len++;
615 }
616 #ifndef WC_UTF16
617 else if (((wxUint32)cc) > 0xffff)
618 {
619 // no surrogate pair generation (yet?)
620 return (size_t)-1;
621 }
622 #endif
623 else
624 {
625 if (buf)
626 *buf++ = '+';
627 len++;
628 if (cc != '+')
629 {
630 // BASE64 encode string
631 unsigned int lsb, d, l;
632 for (d = 0, l = 0; /*nothing*/; psz++)
633 {
634 for (lsb = 0; lsb < 2; lsb ++)
635 {
636 d <<= 8;
637 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
638
639 for (l += 8; l >= 6; )
640 {
641 l -= 6;
642 if (buf)
643 *buf++ = utf7enb64[(d >> l) % 64];
644 len++;
645 }
646 }
647 cc = *psz;
648 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
649 break;
650 }
651 if (l != 0)
652 {
653 if (buf)
654 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
655 len++;
656 }
657 }
658 if (buf)
659 *buf++ = '-';
660 len++;
661 }
662 }
663 if (buf && (len < n))
664 *buf = 0;
665 return len;
666 }
667
668 // ----------------------------------------------------------------------------
669 // UTF-8
670 // ----------------------------------------------------------------------------
671
672 static wxUint32 utf8_max[]=
673 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
674
675 // boundaries of the private use area we use to (temporarily) remap invalid
676 // characters invalid in a UTF-8 encoded string
677 const wxUint32 wxUnicodePUA = 0x100000;
678 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
679
680 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
681 {
682 size_t len = 0;
683
684 while (*psz && ((!buf) || (len < n)))
685 {
686 const char *opsz = psz;
687 bool invalid = false;
688 unsigned char cc = *psz++, fc = cc;
689 unsigned cnt;
690 for (cnt = 0; fc & 0x80; cnt++)
691 fc <<= 1;
692 if (!cnt)
693 {
694 // plain ASCII char
695 if (buf)
696 *buf++ = cc;
697 len++;
698
699 // escape the escape character for octal escapes
700 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
701 && cc == '\\' && (!buf || len < n))
702 {
703 if (buf)
704 *buf++ = cc;
705 len++;
706 }
707 }
708 else
709 {
710 cnt--;
711 if (!cnt)
712 {
713 // invalid UTF-8 sequence
714 invalid = true;
715 }
716 else
717 {
718 unsigned ocnt = cnt - 1;
719 wxUint32 res = cc & (0x3f >> cnt);
720 while (cnt--)
721 {
722 cc = *psz;
723 if ((cc & 0xC0) != 0x80)
724 {
725 // invalid UTF-8 sequence
726 invalid = true;
727 break;
728 }
729 psz++;
730 res = (res << 6) | (cc & 0x3f);
731 }
732 if (invalid || res <= utf8_max[ocnt])
733 {
734 // illegal UTF-8 encoding
735 invalid = true;
736 }
737 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
738 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
739 {
740 // if one of our PUA characters turns up externally
741 // it must also be treated as an illegal sequence
742 // (a bit like you have to escape an escape character)
743 invalid = true;
744 }
745 else
746 {
747 #ifdef WC_UTF16
748 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
749 size_t pa = encode_utf16(res, (wxUint16 *)buf);
750 if (pa == (size_t)-1)
751 {
752 invalid = true;
753 }
754 else
755 {
756 if (buf)
757 buf += pa;
758 len += pa;
759 }
760 #else // !WC_UTF16
761 if (buf)
762 *buf++ = (wchar_t)res;
763 len++;
764 #endif // WC_UTF16/!WC_UTF16
765 }
766 }
767 if (invalid)
768 {
769 if (m_options & MAP_INVALID_UTF8_TO_PUA)
770 {
771 while (opsz < psz && (!buf || len < n))
772 {
773 #ifdef WC_UTF16
774 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
775 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
776 wxASSERT(pa != (size_t)-1);
777 if (buf)
778 buf += pa;
779 opsz++;
780 len += pa;
781 #else
782 if (buf)
783 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
784 opsz++;
785 len++;
786 #endif
787 }
788 }
789 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
790 {
791 while (opsz < psz && (!buf || len < n))
792 {
793 if ( buf && len + 3 < n )
794 {
795 unsigned char on = *opsz;
796 *buf++ = L'\\';
797 *buf++ = (wchar_t)( L'0' + on / 0100 );
798 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
799 *buf++ = (wchar_t)( L'0' + on % 010 );
800 }
801 opsz++;
802 len += 4;
803 }
804 }
805 else // MAP_INVALID_UTF8_NOT
806 {
807 return (size_t)-1;
808 }
809 }
810 }
811 }
812 if (buf && (len < n))
813 *buf = 0;
814 return len;
815 }
816
// Returns true if wch is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
821
822 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
823 {
824 size_t len = 0;
825
826 while (*psz && ((!buf) || (len < n)))
827 {
828 wxUint32 cc;
829 #ifdef WC_UTF16
830 // cast is ok for WC_UTF16
831 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
832 psz += (pa == (size_t)-1) ? 1 : pa;
833 #else
834 cc=(*psz++) & 0x7fffffff;
835 #endif
836
837 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
838 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
839 {
840 if (buf)
841 *buf++ = (char)(cc - wxUnicodePUA);
842 len++;
843 }
844 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
845 && cc == L'\\' && psz[0] == L'\\' )
846 {
847 if (buf)
848 *buf++ = (char)cc;
849 psz++;
850 len++;
851 }
852 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
853 cc == L'\\' &&
854 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
855 {
856 if (buf)
857 {
858 *buf++ = (char) ((psz[0] - L'0')*0100 +
859 (psz[1] - L'0')*010 +
860 (psz[2] - L'0'));
861 }
862
863 psz += 3;
864 len++;
865 }
866 else
867 {
868 unsigned cnt;
869 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
870 if (!cnt)
871 {
872 // plain ASCII char
873 if (buf)
874 *buf++ = (char) cc;
875 len++;
876 }
877
878 else
879 {
880 len += cnt + 1;
881 if (buf)
882 {
883 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
884 while (cnt--)
885 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
886 }
887 }
888 }
889 }
890
891 if (buf && (len<n))
892 *buf = 0;
893
894 return len;
895 }
896
897 // ----------------------------------------------------------------------------
898 // UTF-16
899 // ----------------------------------------------------------------------------
900
// The UTF-16 conversions are implemented below as "straight" (no byte
// swapping) and "swap" (bytes of each unit are swapped) variants: map them
// to the LE and BE class names depending on whether WORDS_BIGENDIAN is
// defined.
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
908
909
910 #ifdef WC_UTF16
911
912 // copy 16bit MB to 16bit String
913 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
914 {
915 size_t len=0;
916
917 while (*(wxUint16*)psz && (!buf || len < n))
918 {
919 if (buf)
920 *buf++ = *(wxUint16*)psz;
921 len++;
922
923 psz += sizeof(wxUint16);
924 }
925 if (buf && len<n) *buf=0;
926
927 return len;
928 }
929
930
931 // copy 16bit String to 16bit MB
932 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
933 {
934 size_t len=0;
935
936 while (*psz && (!buf || len < n))
937 {
938 if (buf)
939 {
940 *(wxUint16*)buf = *psz;
941 buf += sizeof(wxUint16);
942 }
943 len += sizeof(wxUint16);
944 psz++;
945 }
946 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
947
948 return len;
949 }
950
951
952 // swap 16bit MB to 16bit String
953 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
954 {
955 size_t len = 0;
956
957 // UTF16 string must be terminated by 2 NULs as single NULs may occur
958 // inside the string
959 while ( (psz[0] || psz[1]) && (!buf || len < n) )
960 {
961 if ( buf )
962 {
963 ((char *)buf)[0] = psz[1];
964 ((char *)buf)[1] = psz[0];
965 buf++;
966 }
967 len++;
968 psz += 2;
969 }
970
971 if ( buf && len < n )
972 *buf = L'\0';
973
974 return len;
975 }
976
977
978 // swap 16bit MB to 16bit String
979 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
980 {
981 size_t len = 0;
982
983 while ( *psz && (!buf || len < n) )
984 {
985 if ( buf )
986 {
987 *buf++ = ((char*)psz)[1];
988 *buf++ = ((char*)psz)[0];
989 }
990 len += 2;
991 psz++;
992 }
993
994 if ( buf && len < n - 1 )
995 {
996 buf[0] =
997 buf[1] = '\0';
998 }
999
1000 return len;
1001 }
1002
1003
1004 #else // WC_UTF16
1005
1006
1007 // copy 16bit MB to 32bit String
1008 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1009 {
1010 size_t len=0;
1011
1012 while (*(wxUint16*)psz && (!buf || len < n))
1013 {
1014 wxUint32 cc;
1015 size_t pa=decode_utf16((wxUint16*)psz, cc);
1016 if (pa == (size_t)-1)
1017 return pa;
1018
1019 if (buf)
1020 *buf++ = (wchar_t)cc;
1021 len++;
1022 psz += pa * sizeof(wxUint16);
1023 }
1024 if (buf && len<n) *buf=0;
1025
1026 return len;
1027 }
1028
1029
1030 // copy 32bit String to 16bit MB
1031 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1032 {
1033 size_t len=0;
1034
1035 while (*psz && (!buf || len < n))
1036 {
1037 wxUint16 cc[2];
1038 size_t pa=encode_utf16(*psz, cc);
1039
1040 if (pa == (size_t)-1)
1041 return pa;
1042
1043 if (buf)
1044 {
1045 *(wxUint16*)buf = cc[0];
1046 buf += sizeof(wxUint16);
1047 if (pa > 1)
1048 {
1049 *(wxUint16*)buf = cc[1];
1050 buf += sizeof(wxUint16);
1051 }
1052 }
1053
1054 len += pa*sizeof(wxUint16);
1055 psz++;
1056 }
1057 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1058
1059 return len;
1060 }
1061
1062
1063 // swap 16bit MB to 32bit String
1064 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1065 {
1066 size_t len=0;
1067
1068 while (*(wxUint16*)psz && (!buf || len < n))
1069 {
1070 wxUint32 cc;
1071 char tmp[4];
1072 tmp[0]=psz[1]; tmp[1]=psz[0];
1073 tmp[2]=psz[3]; tmp[3]=psz[2];
1074
1075 size_t pa=decode_utf16((wxUint16*)tmp, cc);
1076 if (pa == (size_t)-1)
1077 return pa;
1078
1079 if (buf)
1080 *buf++ = (wchar_t)cc;
1081
1082 len++;
1083 psz += pa * sizeof(wxUint16);
1084 }
1085 if (buf && len<n) *buf=0;
1086
1087 return len;
1088 }
1089
1090
1091 // swap 32bit String to 16bit MB
1092 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1093 {
1094 size_t len=0;
1095
1096 while (*psz && (!buf || len < n))
1097 {
1098 wxUint16 cc[2];
1099 size_t pa=encode_utf16(*psz, cc);
1100
1101 if (pa == (size_t)-1)
1102 return pa;
1103
1104 if (buf)
1105 {
1106 *buf++ = ((char*)cc)[1];
1107 *buf++ = ((char*)cc)[0];
1108 if (pa > 1)
1109 {
1110 *buf++ = ((char*)cc)[3];
1111 *buf++ = ((char*)cc)[2];
1112 }
1113 }
1114
1115 len += pa*sizeof(wxUint16);
1116 psz++;
1117 }
1118 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1119
1120 return len;
1121 }
1122
1123 #endif // WC_UTF16
1124
1125
1126 // ----------------------------------------------------------------------------
1127 // UTF-32
1128 // ----------------------------------------------------------------------------
1129
// The UTF-32 conversions are implemented below as "straight" (no byte
// swapping) and "swap" (bytes of each unit are reversed) variants: map them
// to the LE and BE class names depending on whether WORDS_BIGENDIAN is
// defined.
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32LE
    #define wxMBConvUTF32swap     wxMBConvUTF32BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32BE
    #define wxMBConvUTF32swap     wxMBConvUTF32LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1137
1138
// File-scope UTF-32 converter objects, one for each byte order.
// NOTE(review): WXDLLIMPEXP_DATA_BASE() is defined outside of this file,
// presumably it only adds the DLL import/export decoration -- confirm.
1139 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1140 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1141
1142
1143 #ifdef WC_UTF16
1144
1145 // copy 32bit MB to 16bit String
1146 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1147 {
1148 size_t len=0;
1149
1150 while (*(wxUint32*)psz && (!buf || len < n))
1151 {
1152 wxUint16 cc[2];
1153
1154 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1155 if (pa == (size_t)-1)
1156 return pa;
1157
1158 if (buf)
1159 {
1160 *buf++ = cc[0];
1161 if (pa > 1)
1162 *buf++ = cc[1];
1163 }
1164 len += pa;
1165 psz += sizeof(wxUint32);
1166 }
1167 if (buf && len<n) *buf=0;
1168
1169 return len;
1170 }
1171
1172
1173 // copy 16bit String to 32bit MB
1174 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1175 {
1176 size_t len=0;
1177
1178 while (*psz && (!buf || len < n))
1179 {
1180 wxUint32 cc;
1181
1182 // cast is ok for WC_UTF16
1183 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1184 if (pa == (size_t)-1)
1185 return pa;
1186
1187 if (buf)
1188 {
1189 *(wxUint32*)buf = cc;
1190 buf += sizeof(wxUint32);
1191 }
1192 len += sizeof(wxUint32);
1193 psz += pa;
1194 }
1195
1196 if (buf && len<=n-sizeof(wxUint32))
1197 *(wxUint32*)buf=0;
1198
1199 return len;
1200 }
1201
1202
1203
1204 // swap 32bit MB to 16bit String
1205 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1206 {
1207 size_t len=0;
1208
1209 while (*(wxUint32*)psz && (!buf || len < n))
1210 {
1211 char tmp[4];
1212 tmp[0] = psz[3]; tmp[1] = psz[2];
1213 tmp[2] = psz[1]; tmp[3] = psz[0];
1214
1215
1216 wxUint16 cc[2];
1217
1218 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1219 if (pa == (size_t)-1)
1220 return pa;
1221
1222 if (buf)
1223 {
1224 *buf++ = cc[0];
1225 if (pa > 1)
1226 *buf++ = cc[1];
1227 }
1228 len += pa;
1229 psz += sizeof(wxUint32);
1230 }
1231
1232 if (buf && len<n)
1233 *buf=0;
1234
1235 return len;
1236 }
1237
1238
1239 // swap 16bit String to 32bit MB
1240 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1241 {
1242 size_t len=0;
1243
1244 while (*psz && (!buf || len < n))
1245 {
1246 char cc[4];
1247
1248 // cast is ok for WC_UTF16
1249 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1250 if (pa == (size_t)-1)
1251 return pa;
1252
1253 if (buf)
1254 {
1255 *buf++ = cc[3];
1256 *buf++ = cc[2];
1257 *buf++ = cc[1];
1258 *buf++ = cc[0];
1259 }
1260 len += sizeof(wxUint32);
1261 psz += pa;
1262 }
1263
1264 if (buf && len<=n-sizeof(wxUint32))
1265 *(wxUint32*)buf=0;
1266
1267 return len;
1268 }
1269
1270 #else // WC_UTF16
1271
1272
1273 // copy 32bit MB to 32bit String
1274 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1275 {
1276 size_t len=0;
1277
1278 while (*(wxUint32*)psz && (!buf || len < n))
1279 {
1280 if (buf)
1281 *buf++ = (wchar_t)(*(wxUint32*)psz);
1282 len++;
1283 psz += sizeof(wxUint32);
1284 }
1285
1286 if (buf && len<n)
1287 *buf=0;
1288
1289 return len;
1290 }
1291
1292
1293 // copy 32bit String to 32bit MB
1294 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1295 {
1296 size_t len=0;
1297
1298 while (*psz && (!buf || len < n))
1299 {
1300 if (buf)
1301 {
1302 *(wxUint32*)buf = *psz;
1303 buf += sizeof(wxUint32);
1304 }
1305
1306 len += sizeof(wxUint32);
1307 psz++;
1308 }
1309
1310 if (buf && len<=n-sizeof(wxUint32))
1311 *(wxUint32*)buf=0;
1312
1313 return len;
1314 }
1315
1316
1317 // swap 32bit MB to 32bit String
1318 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1319 {
1320 size_t len=0;
1321
1322 while (*(wxUint32*)psz && (!buf || len < n))
1323 {
1324 if (buf)
1325 {
1326 ((char *)buf)[0] = psz[3];
1327 ((char *)buf)[1] = psz[2];
1328 ((char *)buf)[2] = psz[1];
1329 ((char *)buf)[3] = psz[0];
1330 buf++;
1331 }
1332 len++;
1333 psz += sizeof(wxUint32);
1334 }
1335
1336 if (buf && len<n)
1337 *buf=0;
1338
1339 return len;
1340 }
1341
1342
1343 // swap 32bit String to 32bit MB
1344 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1345 {
1346 size_t len=0;
1347
1348 while (*psz && (!buf || len < n))
1349 {
1350 if (buf)
1351 {
1352 *buf++ = ((char *)psz)[3];
1353 *buf++ = ((char *)psz)[2];
1354 *buf++ = ((char *)psz)[1];
1355 *buf++ = ((char *)psz)[0];
1356 }
1357 len += sizeof(wxUint32);
1358 psz++;
1359 }
1360
1361 if (buf && len<=n-sizeof(wxUint32))
1362 *(wxUint32*)buf=0;
1363
1364 return len;
1365 }
1366
1367
1368 #endif // WC_UTF16
1369
1370
1371 // ============================================================================
1372 // The classes doing conversion using the iconv_xxx() functions
1373 // ============================================================================
1374
1375 #ifdef HAVE_ICONV
1376
1377 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1378 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1379 // (unless there's yet another bug in glibc) the only case when iconv()
1380 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1381 // left in the input buffer -- when _real_ error occurs,
1382 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1383 // iconv() failure.
1384 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) is true if an iconv() call which returned cres
// and left bufLeft bytes unconsumed in its input buffer really failed.
//
// NB: the arguments are parenthesized so that expressions containing
//     operators of lower precedence than "==" or "!=" can be passed safely
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif
1391
1392 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1393
1394 #define ICONV_T_INVALID ((iconv_t)-1)
1395
1396 #if SIZEOF_WCHAR_T == 4
1397 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1398 #define WC_ENC wxFONTENCODING_UTF32
1399 #elif SIZEOF_WCHAR_T == 2
1400 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1401 #define WC_ENC wxFONTENCODING_UTF16
1402 #else // sizeof(wchar_t) != 2 nor 4
1403 // does this ever happen?
1404 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1405 #endif
1406
1407 // ----------------------------------------------------------------------------
1408 // wxMBConv_iconv: encapsulates an iconv character set
1409 // ----------------------------------------------------------------------------
1410
1411 class wxMBConv_iconv : public wxMBConv
1412 {
1413 public:
1414 wxMBConv_iconv(const wxChar *name);
1415 virtual ~wxMBConv_iconv();
1416
1417 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1418 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1419
1420 // classify this encoding as explained in wxMBConv::GetMBNulLen()
1421 // comment
1422 virtual size_t GetMBNulLen() const;
1423
1424 bool IsOk() const
1425 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1426
1427 protected:
1428 // the iconv handlers used to translate from multibyte to wide char and in
1429 // the other direction
1430 iconv_t m2w,
1431 w2m;
1432 #if wxUSE_THREADS
1433 // guards access to m2w and w2m objects
1434 wxMutex m_iconvMutex;
1435 #endif
1436
1437 private:
1438 // the name (for iconv_open()) of a wide char charset -- if none is
1439 // available on this machine, it will remain NULL
1440 static wxString ms_wcCharsetName;
1441
1442 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1443 // different endian-ness than the native one
1444 static bool ms_wcNeedsSwap;
1445
1446 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1447 // initially
1448 size_t m_minMBCharWidth;
1449 };
1450
1451 // make the constructor available for unit testing
1452 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1453 {
1454 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1455 if ( !result->IsOk() )
1456 {
1457 delete result;
1458 return 0;
1459 }
1460 return result;
1461 }
1462
1463 wxString wxMBConv_iconv::ms_wcCharsetName;
1464 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1465
1466 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1467 {
1468 m_minMBCharWidth = 0;
1469
1470 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1471 // names for the charsets
1472 const wxCharBuffer cname(wxString(name).ToAscii());
1473
1474 // check for charset that represents wchar_t:
1475 if ( ms_wcCharsetName.empty() )
1476 {
1477 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1478
1479 #if wxUSE_FONTMAP
1480 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1481 #else // !wxUSE_FONTMAP
1482 static const wxChar *names[] =
1483 {
1484 #if SIZEOF_WCHAR_T == 4
1485 _T("UCS-4"),
1486 #elif SIZEOF_WCHAR_T = 2
1487 _T("UCS-2"),
1488 #endif
1489 NULL
1490 };
1491 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1492
1493 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1494 {
1495 const wxString nameCS(*names);
1496
1497 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1498 wxString nameXE(nameCS);
1499 #ifdef WORDS_BIGENDIAN
1500 nameXE += _T("BE");
1501 #else // little endian
1502 nameXE += _T("LE");
1503 #endif
1504
1505 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1506 nameXE.c_str());
1507
1508 m2w = iconv_open(nameXE.ToAscii(), cname);
1509 if ( m2w == ICONV_T_INVALID )
1510 {
1511 // try charset w/o bytesex info (e.g. "UCS4")
1512 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1513 nameCS.c_str());
1514 m2w = iconv_open(nameCS.ToAscii(), cname);
1515
1516 // and check for bytesex ourselves:
1517 if ( m2w != ICONV_T_INVALID )
1518 {
1519 char buf[2], *bufPtr;
1520 wchar_t wbuf[2], *wbufPtr;
1521 size_t insz, outsz;
1522 size_t res;
1523
1524 buf[0] = 'A';
1525 buf[1] = 0;
1526 wbuf[0] = 0;
1527 insz = 2;
1528 outsz = SIZEOF_WCHAR_T * 2;
1529 wbufPtr = wbuf;
1530 bufPtr = buf;
1531
1532 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1533 (char**)&wbufPtr, &outsz);
1534
1535 if (ICONV_FAILED(res, insz))
1536 {
1537 wxLogLastError(wxT("iconv"));
1538 wxLogError(_("Conversion to charset '%s' doesn't work."),
1539 nameCS.c_str());
1540 }
1541 else // ok, can convert to this encoding, remember it
1542 {
1543 ms_wcCharsetName = nameCS;
1544 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1545 }
1546 }
1547 }
1548 else // use charset not requiring byte swapping
1549 {
1550 ms_wcCharsetName = nameXE;
1551 }
1552 }
1553
1554 wxLogTrace(TRACE_STRCONV,
1555 wxT("iconv wchar_t charset is \"%s\"%s"),
1556 ms_wcCharsetName.empty() ? _T("<none>")
1557 : ms_wcCharsetName.c_str(),
1558 ms_wcNeedsSwap ? _T(" (needs swap)")
1559 : _T(""));
1560 }
1561 else // we already have ms_wcCharsetName
1562 {
1563 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1564 }
1565
1566 if ( ms_wcCharsetName.empty() )
1567 {
1568 w2m = ICONV_T_INVALID;
1569 }
1570 else
1571 {
1572 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1573 if ( w2m == ICONV_T_INVALID )
1574 {
1575 wxLogTrace(TRACE_STRCONV,
1576 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1577 ms_wcCharsetName.c_str(), cname.data());
1578 }
1579 }
1580 }
1581
1582 wxMBConv_iconv::~wxMBConv_iconv()
1583 {
1584 if ( m2w != ICONV_T_INVALID )
1585 iconv_close(m2w);
1586 if ( w2m != ICONV_T_INVALID )
1587 iconv_close(w2m);
1588 }
1589
1590 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1591 {
1592 // find the string length: notice that must be done differently for
1593 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1594 size_t inbuf;
1595 const size_t nulLen = GetMBNulLen();
1596 switch ( nulLen )
1597 {
1598 default:
1599 return (size_t)-1;
1600
1601 case 1:
1602 inbuf = strlen(psz); // arguably more optimized than our version
1603 break;
1604
1605 case 2:
1606 case 4:
1607 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1608 // they also have to start at character boundary and not span two
1609 // adjacent characters
1610 const char *p;
1611 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1612 ;
1613 inbuf = p - psz;
1614 break;
1615 }
1616
1617 #if wxUSE_THREADS
1618 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1619 // Unfortunately there is a couple of global wxCSConv objects such as
1620 // wxConvLocal that are used all over wx code, so we have to make sure
1621 // the handle is used by at most one thread at the time. Otherwise
1622 // only a few wx classes would be safe to use from non-main threads
1623 // as MB<->WC conversion would fail "randomly".
1624 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1625 #endif // wxUSE_THREADS
1626
1627
1628 size_t outbuf = n * SIZEOF_WCHAR_T;
1629 size_t res, cres;
1630 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1631 wchar_t *bufPtr = buf;
1632 const char *pszPtr = psz;
1633
1634 if (buf)
1635 {
1636 // have destination buffer, convert there
1637 cres = iconv(m2w,
1638 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1639 (char**)&bufPtr, &outbuf);
1640 res = n - (outbuf / SIZEOF_WCHAR_T);
1641
1642 if (ms_wcNeedsSwap)
1643 {
1644 // convert to native endianness
1645 for ( unsigned i = 0; i < res; i++ )
1646 buf[n] = WC_BSWAP(buf[i]);
1647 }
1648
1649 // NUL-terminate the string if there is any space left
1650 if (res < n)
1651 buf[res] = 0;
1652 }
1653 else
1654 {
1655 // no destination buffer... convert using temp buffer
1656 // to calculate destination buffer requirement
1657 wchar_t tbuf[8];
1658 res = 0;
1659 do {
1660 bufPtr = tbuf;
1661 outbuf = 8*SIZEOF_WCHAR_T;
1662
1663 cres = iconv(m2w,
1664 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1665 (char**)&bufPtr, &outbuf );
1666
1667 res += 8-(outbuf/SIZEOF_WCHAR_T);
1668 } while ((cres==(size_t)-1) && (errno==E2BIG));
1669 }
1670
1671 if (ICONV_FAILED(cres, inbuf))
1672 {
1673 //VS: it is ok if iconv fails, hence trace only
1674 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1675 return (size_t)-1;
1676 }
1677
1678 return res;
1679 }
1680
1681 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1682 {
1683 #if wxUSE_THREADS
1684 // NB: explained in MB2WC
1685 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1686 #endif
1687
1688 size_t inlen = wxWcslen(psz);
1689 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1690 size_t outbuf = n;
1691 size_t res, cres;
1692
1693 wchar_t *tmpbuf = 0;
1694
1695 if (ms_wcNeedsSwap)
1696 {
1697 // need to copy to temp buffer to switch endianness
1698 // (doing WC_BSWAP twice on the original buffer won't help, as it
1699 // could be in read-only memory, or be accessed in some other thread)
1700 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1701 for ( size_t i = 0; i < inlen; i++ )
1702 tmpbuf[n] = WC_BSWAP(psz[i]);
1703 tmpbuf[inlen] = L'\0';
1704 psz = tmpbuf;
1705 }
1706
1707 if (buf)
1708 {
1709 // have destination buffer, convert there
1710 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1711
1712 res = n-outbuf;
1713
1714 // NB: iconv was given only wcslen(psz) characters on input, and so
1715 // it couldn't convert the trailing zero. Let's do it ourselves
1716 // if there's some room left for it in the output buffer.
1717 if (res < n)
1718 buf[0] = 0;
1719 }
1720 else
1721 {
1722 // no destination buffer... convert using temp buffer
1723 // to calculate destination buffer requirement
1724 char tbuf[16];
1725 res = 0;
1726 do {
1727 buf = tbuf; outbuf = 16;
1728
1729 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1730
1731 res += 16 - outbuf;
1732 } while ((cres==(size_t)-1) && (errno==E2BIG));
1733 }
1734
1735 if (ms_wcNeedsSwap)
1736 {
1737 free(tmpbuf);
1738 }
1739
1740 if (ICONV_FAILED(cres, inbuf))
1741 {
1742 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1743 return (size_t)-1;
1744 }
1745
1746 return res;
1747 }
1748
1749 size_t wxMBConv_iconv::GetMBNulLen() const
1750 {
1751 if ( m_minMBCharWidth == 0 )
1752 {
1753 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1754
1755 #if wxUSE_THREADS
1756 // NB: explained in MB2WC
1757 wxMutexLocker lock(self->m_iconvMutex);
1758 #endif
1759
1760 wchar_t *wnul = L"";
1761 char buf[8]; // should be enough for NUL in any encoding
1762 size_t inLen = sizeof(wchar_t),
1763 outLen = WXSIZEOF(buf);
1764 char *in = (char *)wnul;
1765 char *out = buf;
1766 if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1767 {
1768 self->m_minMBCharWidth = (size_t)-1;
1769 }
1770 else // ok
1771 {
1772 self->m_minMBCharWidth = out - buf;
1773 }
1774 }
1775
1776 return m_minMBCharWidth;
1777 }
1778
1779 #endif // HAVE_ICONV
1780
1781
1782 // ============================================================================
1783 // Win32 conversion classes
1784 // ============================================================================
1785
1786 #ifdef wxHAVE_WIN32_MB2WC
1787
1788 // from utils.cpp
1789 #if wxUSE_FONTMAP
1790 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1791 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1792 #endif
1793
1794 class wxMBConv_win32 : public wxMBConv
1795 {
1796 public:
1797 wxMBConv_win32()
1798 {
1799 m_CodePage = CP_ACP;
1800 m_minMBCharWidth = 0;
1801 }
1802
1803 #if wxUSE_FONTMAP
1804 wxMBConv_win32(const wxChar* name)
1805 {
1806 m_CodePage = wxCharsetToCodepage(name);
1807 m_minMBCharWidth = 0;
1808 }
1809
1810 wxMBConv_win32(wxFontEncoding encoding)
1811 {
1812 m_CodePage = wxEncodingToCodepage(encoding);
1813 m_minMBCharWidth = 0;
1814 }
1815 #endif // wxUSE_FONTMAP
1816
1817 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1818 {
1819 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1820 // the behaviour is not compatible with the Unix version (using iconv)
1821 // and break the library itself, e.g. wxTextInputStream::NextChar()
1822 // wouldn't work if reading an incomplete MB char didn't result in an
1823 // error
1824 //
1825 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1826 // Win XP or newer and it is not supported for UTF-[78] so we always
1827 // use our own conversions in this case. See
1828 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1829 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1830 if ( m_CodePage == CP_UTF8 )
1831 {
1832 return wxConvUTF8.MB2WC(buf, psz, n);
1833 }
1834
1835 if ( m_CodePage == CP_UTF7 )
1836 {
1837 return wxConvUTF7.MB2WC(buf, psz, n);
1838 }
1839
1840 int flags = 0;
1841 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
1842 IsAtLeastWin2kSP4() )
1843 {
1844 flags = MB_ERR_INVALID_CHARS;
1845 }
1846
1847 const size_t len = ::MultiByteToWideChar
1848 (
1849 m_CodePage, // code page
1850 flags, // flags: fall on error
1851 psz, // input string
1852 -1, // its length (NUL-terminated)
1853 buf, // output string
1854 buf ? n : 0 // size of output buffer
1855 );
1856 if ( !len )
1857 {
1858 // function totally failed
1859 return (size_t)-1;
1860 }
1861
1862 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1863 // check if we succeeded, by doing a double trip:
1864 if ( !flags && buf )
1865 {
1866 const size_t mbLen = strlen(psz);
1867 wxCharBuffer mbBuf(mbLen);
1868 if ( ::WideCharToMultiByte
1869 (
1870 m_CodePage,
1871 0,
1872 buf,
1873 -1,
1874 mbBuf.data(),
1875 mbLen + 1, // size in bytes, not length
1876 NULL,
1877 NULL
1878 ) == 0 ||
1879 strcmp(mbBuf, psz) != 0 )
1880 {
1881 // we didn't obtain the same thing we started from, hence
1882 // the conversion was lossy and we consider that it failed
1883 return (size_t)-1;
1884 }
1885 }
1886
1887 // note that it returns count of written chars for buf != NULL and size
1888 // of the needed buffer for buf == NULL so in either case the length of
1889 // the string (which never includes the terminating NUL) is one less
1890 return len - 1;
1891 }
1892
1893 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1894 {
1895 /*
1896 we have a problem here: by default, WideCharToMultiByte() may
1897 replace characters unrepresentable in the target code page with bad
1898 quality approximations such as turning "1/2" symbol (U+00BD) into
1899 "1" for the code pages which don't have it and we, obviously, want
1900 to avoid this at any price
1901
1902 the trouble is that this function does it _silently_, i.e. it won't
1903 even tell us whether it did or not... Win98/2000 and higher provide
1904 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1905 we have to resort to a round trip, i.e. check that converting back
1906 results in the same string -- this is, of course, expensive but
1907 otherwise we simply can't be sure to not garble the data.
1908 */
1909
1910 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1911 // it doesn't work with CJK encodings (which we test for rather roughly
1912 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1913 // supporting it
1914 BOOL usedDef wxDUMMY_INITIALIZE(false);
1915 BOOL *pUsedDef;
1916 int flags;
1917 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1918 {
1919 // it's our lucky day
1920 flags = WC_NO_BEST_FIT_CHARS;
1921 pUsedDef = &usedDef;
1922 }
1923 else // old system or unsupported encoding
1924 {
1925 flags = 0;
1926 pUsedDef = NULL;
1927 }
1928
1929 const size_t len = ::WideCharToMultiByte
1930 (
1931 m_CodePage, // code page
1932 flags, // either none or no best fit
1933 pwz, // input string
1934 -1, // it is (wide) NUL-terminated
1935 buf, // output buffer
1936 buf ? n : 0, // and its size
1937 NULL, // default "replacement" char
1938 pUsedDef // [out] was it used?
1939 );
1940
1941 if ( !len )
1942 {
1943 // function totally failed
1944 return (size_t)-1;
1945 }
1946
1947 // if we were really converting, check if we succeeded
1948 if ( buf )
1949 {
1950 if ( flags )
1951 {
1952 // check if the conversion failed, i.e. if any replacements
1953 // were done
1954 if ( usedDef )
1955 return (size_t)-1;
1956 }
1957 else // we must resort to double tripping...
1958 {
1959 wxWCharBuffer wcBuf(n);
1960 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1961 wcscmp(wcBuf, pwz) != 0 )
1962 {
1963 // we didn't obtain the same thing we started from, hence
1964 // the conversion was lossy and we consider that it failed
1965 return (size_t)-1;
1966 }
1967 }
1968 }
1969
1970 // see the comment above for the reason of "len - 1"
1971 return len - 1;
1972 }
1973
1974 virtual size_t GetMBNulLen() const
1975 {
1976 if ( m_minMBCharWidth == 0 )
1977 {
1978 int len = ::WideCharToMultiByte
1979 (
1980 m_CodePage, // code page
1981 0, // no flags
1982 L"", // input string
1983 1, // translate just the NUL
1984 NULL, // output buffer
1985 0, // and its size
1986 NULL, // no replacement char
1987 NULL // [out] don't care if it was used
1988 );
1989
1990 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1991 switch ( len )
1992 {
1993 default:
1994 wxLogDebug(_T("Unexpected NUL length %d"), len);
1995 // fall through
1996
1997 case 0:
1998 self->m_minMBCharWidth = (size_t)-1;
1999 break;
2000
2001 case 1:
2002 case 2:
2003 case 4:
2004 self->m_minMBCharWidth = len;
2005 break;
2006 }
2007 }
2008
2009 return m_minMBCharWidth;
2010 }
2011
2012 bool IsOk() const { return m_CodePage != -1; }
2013
2014 private:
2015 static bool CanUseNoBestFit()
2016 {
2017 static int s_isWin98Or2k = -1;
2018
2019 if ( s_isWin98Or2k == -1 )
2020 {
2021 int verMaj, verMin;
2022 switch ( wxGetOsVersion(&verMaj, &verMin) )
2023 {
2024 case wxWIN95:
2025 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2026 break;
2027
2028 case wxWINDOWS_NT:
2029 s_isWin98Or2k = verMaj >= 5;
2030 break;
2031
2032 default:
2033 // unknown, be conseravtive by default
2034 s_isWin98Or2k = 0;
2035 }
2036
2037 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2038 }
2039
2040 return s_isWin98Or2k == 1;
2041 }
2042
2043 static bool IsAtLeastWin2kSP4()
2044 {
2045 #ifdef __WXWINCE__
2046 return false;
2047 #else
2048 static int s_isAtLeastWin2kSP4 = -1;
2049
2050 if ( s_isAtLeastWin2kSP4 == -1 )
2051 {
2052 OSVERSIONINFOEX ver;
2053
2054 memset(&ver, 0, sizeof(ver));
2055 ver.dwOSVersionInfoSize = sizeof(ver);
2056 GetVersionEx((OSVERSIONINFO*)&ver);
2057
2058 s_isAtLeastWin2kSP4 =
2059 ((ver.dwMajorVersion > 5) || // Vista+
2060 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2061 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2062 ver.wServicePackMajor >= 4)) // 2000 SP4+
2063 ? 1 : 0;
2064 }
2065
2066 return s_isAtLeastWin2kSP4 == 1;
2067 #endif
2068 }
2069
2070
2071 // the code page we're working with
2072 long m_CodePage;
2073
2074 // cached result of GetMBNulLen(), set to 0 initially meaning
2075 // "unknown"
2076 size_t m_minMBCharWidth;
2077 };
2078
2079 #endif // wxHAVE_WIN32_MB2WC
2080
2081 // ============================================================================
2082 // Cocoa conversion classes
2083 // ============================================================================
2084
2085 #if defined(__WXCOCOA__)
2086
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
2090
2091 #include <CoreFoundation/CFString.h>
2092 #include <CoreFoundation/CFStringEncodingExt.h>
2093
2094 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2095 {
2096 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2097 if ( encoding == wxFONTENCODING_DEFAULT )
2098 {
2099 enc = CFStringGetSystemEncoding();
2100 }
2101 else switch( encoding)
2102 {
2103 case wxFONTENCODING_ISO8859_1 :
2104 enc = kCFStringEncodingISOLatin1 ;
2105 break ;
2106 case wxFONTENCODING_ISO8859_2 :
2107 enc = kCFStringEncodingISOLatin2;
2108 break ;
2109 case wxFONTENCODING_ISO8859_3 :
2110 enc = kCFStringEncodingISOLatin3 ;
2111 break ;
2112 case wxFONTENCODING_ISO8859_4 :
2113 enc = kCFStringEncodingISOLatin4;
2114 break ;
2115 case wxFONTENCODING_ISO8859_5 :
2116 enc = kCFStringEncodingISOLatinCyrillic;
2117 break ;
2118 case wxFONTENCODING_ISO8859_6 :
2119 enc = kCFStringEncodingISOLatinArabic;
2120 break ;
2121 case wxFONTENCODING_ISO8859_7 :
2122 enc = kCFStringEncodingISOLatinGreek;
2123 break ;
2124 case wxFONTENCODING_ISO8859_8 :
2125 enc = kCFStringEncodingISOLatinHebrew;
2126 break ;
2127 case wxFONTENCODING_ISO8859_9 :
2128 enc = kCFStringEncodingISOLatin5;
2129 break ;
2130 case wxFONTENCODING_ISO8859_10 :
2131 enc = kCFStringEncodingISOLatin6;
2132 break ;
2133 case wxFONTENCODING_ISO8859_11 :
2134 enc = kCFStringEncodingISOLatinThai;
2135 break ;
2136 case wxFONTENCODING_ISO8859_13 :
2137 enc = kCFStringEncodingISOLatin7;
2138 break ;
2139 case wxFONTENCODING_ISO8859_14 :
2140 enc = kCFStringEncodingISOLatin8;
2141 break ;
2142 case wxFONTENCODING_ISO8859_15 :
2143 enc = kCFStringEncodingISOLatin9;
2144 break ;
2145
2146 case wxFONTENCODING_KOI8 :
2147 enc = kCFStringEncodingKOI8_R;
2148 break ;
2149 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2150 enc = kCFStringEncodingDOSRussian;
2151 break ;
2152
2153 // case wxFONTENCODING_BULGARIAN :
2154 // enc = ;
2155 // break ;
2156
2157 case wxFONTENCODING_CP437 :
2158 enc =kCFStringEncodingDOSLatinUS ;
2159 break ;
2160 case wxFONTENCODING_CP850 :
2161 enc = kCFStringEncodingDOSLatin1;
2162 break ;
2163 case wxFONTENCODING_CP852 :
2164 enc = kCFStringEncodingDOSLatin2;
2165 break ;
2166 case wxFONTENCODING_CP855 :
2167 enc = kCFStringEncodingDOSCyrillic;
2168 break ;
2169 case wxFONTENCODING_CP866 :
2170 enc =kCFStringEncodingDOSRussian ;
2171 break ;
2172 case wxFONTENCODING_CP874 :
2173 enc = kCFStringEncodingDOSThai;
2174 break ;
2175 case wxFONTENCODING_CP932 :
2176 enc = kCFStringEncodingDOSJapanese;
2177 break ;
2178 case wxFONTENCODING_CP936 :
2179 enc =kCFStringEncodingDOSChineseSimplif ;
2180 break ;
2181 case wxFONTENCODING_CP949 :
2182 enc = kCFStringEncodingDOSKorean;
2183 break ;
2184 case wxFONTENCODING_CP950 :
2185 enc = kCFStringEncodingDOSChineseTrad;
2186 break ;
2187 case wxFONTENCODING_CP1250 :
2188 enc = kCFStringEncodingWindowsLatin2;
2189 break ;
2190 case wxFONTENCODING_CP1251 :
2191 enc =kCFStringEncodingWindowsCyrillic ;
2192 break ;
2193 case wxFONTENCODING_CP1252 :
2194 enc =kCFStringEncodingWindowsLatin1 ;
2195 break ;
2196 case wxFONTENCODING_CP1253 :
2197 enc = kCFStringEncodingWindowsGreek;
2198 break ;
2199 case wxFONTENCODING_CP1254 :
2200 enc = kCFStringEncodingWindowsLatin5;
2201 break ;
2202 case wxFONTENCODING_CP1255 :
2203 enc =kCFStringEncodingWindowsHebrew ;
2204 break ;
2205 case wxFONTENCODING_CP1256 :
2206 enc =kCFStringEncodingWindowsArabic ;
2207 break ;
2208 case wxFONTENCODING_CP1257 :
2209 enc = kCFStringEncodingWindowsBalticRim;
2210 break ;
2211 // This only really encodes to UTF7 (if that) evidently
2212 // case wxFONTENCODING_UTF7 :
2213 // enc = kCFStringEncodingNonLossyASCII ;
2214 // break ;
2215 case wxFONTENCODING_UTF8 :
2216 enc = kCFStringEncodingUTF8 ;
2217 break ;
2218 case wxFONTENCODING_EUC_JP :
2219 enc = kCFStringEncodingEUC_JP;
2220 break ;
2221 case wxFONTENCODING_UTF16 :
2222 enc = kCFStringEncodingUnicode ;
2223 break ;
2224 case wxFONTENCODING_MACROMAN :
2225 enc = kCFStringEncodingMacRoman ;
2226 break ;
2227 case wxFONTENCODING_MACJAPANESE :
2228 enc = kCFStringEncodingMacJapanese ;
2229 break ;
2230 case wxFONTENCODING_MACCHINESETRAD :
2231 enc = kCFStringEncodingMacChineseTrad ;
2232 break ;
2233 case wxFONTENCODING_MACKOREAN :
2234 enc = kCFStringEncodingMacKorean ;
2235 break ;
2236 case wxFONTENCODING_MACARABIC :
2237 enc = kCFStringEncodingMacArabic ;
2238 break ;
2239 case wxFONTENCODING_MACHEBREW :
2240 enc = kCFStringEncodingMacHebrew ;
2241 break ;
2242 case wxFONTENCODING_MACGREEK :
2243 enc = kCFStringEncodingMacGreek ;
2244 break ;
2245 case wxFONTENCODING_MACCYRILLIC :
2246 enc = kCFStringEncodingMacCyrillic ;
2247 break ;
2248 case wxFONTENCODING_MACDEVANAGARI :
2249 enc = kCFStringEncodingMacDevanagari ;
2250 break ;
2251 case wxFONTENCODING_MACGURMUKHI :
2252 enc = kCFStringEncodingMacGurmukhi ;
2253 break ;
2254 case wxFONTENCODING_MACGUJARATI :
2255 enc = kCFStringEncodingMacGujarati ;
2256 break ;
2257 case wxFONTENCODING_MACORIYA :
2258 enc = kCFStringEncodingMacOriya ;
2259 break ;
2260 case wxFONTENCODING_MACBENGALI :
2261 enc = kCFStringEncodingMacBengali ;
2262 break ;
2263 case wxFONTENCODING_MACTAMIL :
2264 enc = kCFStringEncodingMacTamil ;
2265 break ;
2266 case wxFONTENCODING_MACTELUGU :
2267 enc = kCFStringEncodingMacTelugu ;
2268 break ;
2269 case wxFONTENCODING_MACKANNADA :
2270 enc = kCFStringEncodingMacKannada ;
2271 break ;
2272 case wxFONTENCODING_MACMALAJALAM :
2273 enc = kCFStringEncodingMacMalayalam ;
2274 break ;
2275 case wxFONTENCODING_MACSINHALESE :
2276 enc = kCFStringEncodingMacSinhalese ;
2277 break ;
2278 case wxFONTENCODING_MACBURMESE :
2279 enc = kCFStringEncodingMacBurmese ;
2280 break ;
2281 case wxFONTENCODING_MACKHMER :
2282 enc = kCFStringEncodingMacKhmer ;
2283 break ;
2284 case wxFONTENCODING_MACTHAI :
2285 enc = kCFStringEncodingMacThai ;
2286 break ;
2287 case wxFONTENCODING_MACLAOTIAN :
2288 enc = kCFStringEncodingMacLaotian ;
2289 break ;
2290 case wxFONTENCODING_MACGEORGIAN :
2291 enc = kCFStringEncodingMacGeorgian ;
2292 break ;
2293 case wxFONTENCODING_MACARMENIAN :
2294 enc = kCFStringEncodingMacArmenian ;
2295 break ;
2296 case wxFONTENCODING_MACCHINESESIMP :
2297 enc = kCFStringEncodingMacChineseSimp ;
2298 break ;
2299 case wxFONTENCODING_MACTIBETAN :
2300 enc = kCFStringEncodingMacTibetan ;
2301 break ;
2302 case wxFONTENCODING_MACMONGOLIAN :
2303 enc = kCFStringEncodingMacMongolian ;
2304 break ;
2305 case wxFONTENCODING_MACETHIOPIC :
2306 enc = kCFStringEncodingMacEthiopic ;
2307 break ;
2308 case wxFONTENCODING_MACCENTRALEUR :
2309 enc = kCFStringEncodingMacCentralEurRoman ;
2310 break ;
2311 case wxFONTENCODING_MACVIATNAMESE :
2312 enc = kCFStringEncodingMacVietnamese ;
2313 break ;
2314 case wxFONTENCODING_MACARABICEXT :
2315 enc = kCFStringEncodingMacExtArabic ;
2316 break ;
2317 case wxFONTENCODING_MACSYMBOL :
2318 enc = kCFStringEncodingMacSymbol ;
2319 break ;
2320 case wxFONTENCODING_MACDINGBATS :
2321 enc = kCFStringEncodingMacDingbats ;
2322 break ;
2323 case wxFONTENCODING_MACTURKISH :
2324 enc = kCFStringEncodingMacTurkish ;
2325 break ;
2326 case wxFONTENCODING_MACCROATIAN :
2327 enc = kCFStringEncodingMacCroatian ;
2328 break ;
2329 case wxFONTENCODING_MACICELANDIC :
2330 enc = kCFStringEncodingMacIcelandic ;
2331 break ;
2332 case wxFONTENCODING_MACROMANIAN :
2333 enc = kCFStringEncodingMacRomanian ;
2334 break ;
2335 case wxFONTENCODING_MACCELTIC :
2336 enc = kCFStringEncodingMacCeltic ;
2337 break ;
2338 case wxFONTENCODING_MACGAELIC :
2339 enc = kCFStringEncodingMacGaelic ;
2340 break ;
2341 // case wxFONTENCODING_MACKEYBOARD :
2342 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2343 // break ;
2344 default :
2345 // because gcc is picky
2346 break ;
2347 } ;
2348 return enc ;
2349 }
2350
2351 class wxMBConv_cocoa : public wxMBConv
2352 {
2353 public:
2354 wxMBConv_cocoa()
2355 {
2356 Init(CFStringGetSystemEncoding()) ;
2357 }
2358
2359 #if wxUSE_FONTMAP
2360 wxMBConv_cocoa(const wxChar* name)
2361 {
2362 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2363 }
2364 #endif
2365
2366 wxMBConv_cocoa(wxFontEncoding encoding)
2367 {
2368 Init( wxCFStringEncFromFontEnc(encoding) );
2369 }
2370
2371 ~wxMBConv_cocoa()
2372 {
2373 }
2374
2375 void Init( CFStringEncoding encoding)
2376 {
2377 m_encoding = encoding ;
2378 }
2379
2380 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2381 {
2382 wxASSERT(szUnConv);
2383
2384 CFStringRef theString = CFStringCreateWithBytes (
2385 NULL, //the allocator
2386 (const UInt8*)szUnConv,
2387 strlen(szUnConv),
2388 m_encoding,
2389 false //no BOM/external representation
2390 );
2391
2392 wxASSERT(theString);
2393
2394 size_t nOutLength = CFStringGetLength(theString);
2395
2396 if (szOut == NULL)
2397 {
2398 CFRelease(theString);
2399 return nOutLength;
2400 }
2401
2402 CFRange theRange = { 0, nOutSize };
2403
2404 #if SIZEOF_WCHAR_T == 4
2405 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2406 #endif
2407
2408 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2409
2410 CFRelease(theString);
2411
2412 szUniCharBuffer[nOutLength] = '\0' ;
2413
2414 #if SIZEOF_WCHAR_T == 4
2415 wxMBConvUTF16 converter ;
2416 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2417 delete[] szUniCharBuffer;
2418 #endif
2419
2420 return nOutLength;
2421 }
2422
2423 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2424 {
2425 wxASSERT(szUnConv);
2426
2427 size_t nRealOutSize;
2428 size_t nBufSize = wxWcslen(szUnConv);
2429 UniChar* szUniBuffer = (UniChar*) szUnConv;
2430
2431 #if SIZEOF_WCHAR_T == 4
2432 wxMBConvUTF16 converter ;
2433 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2434 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2435 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2436 nBufSize /= sizeof(UniChar);
2437 #endif
2438
2439 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2440 NULL, //allocator
2441 szUniBuffer,
2442 nBufSize,
2443 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2444 );
2445
2446 wxASSERT(theString);
2447
2448 //Note that CER puts a BOM when converting to unicode
2449 //so we check and use getchars instead in that case
2450 if (m_encoding == kCFStringEncodingUnicode)
2451 {
2452 if (szOut != NULL)
2453 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2454
2455 nRealOutSize = CFStringGetLength(theString) + 1;
2456 }
2457 else
2458 {
2459 CFStringGetBytes(
2460 theString,
2461 CFRangeMake(0, CFStringGetLength(theString)),
2462 m_encoding,
2463 0, //what to put in characters that can't be converted -
2464 //0 tells CFString to return NULL if it meets such a character
2465 false, //not an external representation
2466 (UInt8*) szOut,
2467 nOutSize,
2468 (CFIndex*) &nRealOutSize
2469 );
2470 }
2471
2472 CFRelease(theString);
2473
2474 #if SIZEOF_WCHAR_T == 4
2475 delete[] szUniBuffer;
2476 #endif
2477
2478 return nRealOutSize - 1;
2479 }
2480
2481 bool IsOk() const
2482 {
2483 return m_encoding != kCFStringEncodingInvalidId &&
2484 CFStringIsEncodingAvailable(m_encoding);
2485 }
2486
2487 private:
2488 CFStringEncoding m_encoding ;
2489 };
2490
2491 #endif // defined(__WXCOCOA__)
2492
2493 // ============================================================================
2494 // Mac conversion classes
2495 // ============================================================================
2496
2497 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2498
2499 class wxMBConv_mac : public wxMBConv
2500 {
2501 public:
2502 wxMBConv_mac()
2503 {
2504 Init(CFStringGetSystemEncoding()) ;
2505 }
2506
2507 #if wxUSE_FONTMAP
2508 wxMBConv_mac(const wxChar* name)
2509 {
2510 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2511 }
2512 #endif
2513
2514 wxMBConv_mac(wxFontEncoding encoding)
2515 {
2516 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2517 }
2518
2519 ~wxMBConv_mac()
2520 {
2521 OSStatus status = noErr ;
2522 status = TECDisposeConverter(m_MB2WC_converter);
2523 status = TECDisposeConverter(m_WC2MB_converter);
2524 }
2525
2526
2527 void Init( TextEncodingBase encoding)
2528 {
2529 OSStatus status = noErr ;
2530 m_char_encoding = encoding ;
2531 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2532
2533 status = TECCreateConverter(&m_MB2WC_converter,
2534 m_char_encoding,
2535 m_unicode_encoding);
2536 status = TECCreateConverter(&m_WC2MB_converter,
2537 m_unicode_encoding,
2538 m_char_encoding);
2539 }
2540
2541 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2542 {
2543 OSStatus status = noErr ;
2544 ByteCount byteOutLen ;
2545 ByteCount byteInLen = strlen(psz) ;
2546 wchar_t *tbuf = NULL ;
2547 UniChar* ubuf = NULL ;
2548 size_t res = 0 ;
2549
2550 if (buf == NULL)
2551 {
2552 //apple specs say at least 32
2553 n = wxMax( 32 , byteInLen ) ;
2554 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2555 }
2556 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2557 #if SIZEOF_WCHAR_T == 4
2558 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2559 #else
2560 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2561 #endif
2562 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2563 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2564 #if SIZEOF_WCHAR_T == 4
2565 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2566 // is not properly terminated we get random characters at the end
2567 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2568 wxMBConvUTF16 converter ;
2569 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2570 free( ubuf ) ;
2571 #else
2572 res = byteOutLen / sizeof( UniChar ) ;
2573 #endif
2574 if ( buf == NULL )
2575 free(tbuf) ;
2576
2577 if ( buf && res < n)
2578 buf[res] = 0;
2579
2580 return res ;
2581 }
2582
2583 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2584 {
2585 OSStatus status = noErr ;
2586 ByteCount byteOutLen ;
2587 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2588
2589 char *tbuf = NULL ;
2590
2591 if (buf == NULL)
2592 {
2593 //apple specs say at least 32
2594 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2595 tbuf = (char*) malloc( n ) ;
2596 }
2597
2598 ByteCount byteBufferLen = n ;
2599 UniChar* ubuf = NULL ;
2600 #if SIZEOF_WCHAR_T == 4
2601 wxMBConvUTF16 converter ;
2602 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2603 byteInLen = unicharlen ;
2604 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2605 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2606 #else
2607 ubuf = (UniChar*) psz ;
2608 #endif
2609 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2610 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2611 #if SIZEOF_WCHAR_T == 4
2612 free( ubuf ) ;
2613 #endif
2614 if ( buf == NULL )
2615 free(tbuf) ;
2616
2617 size_t res = byteOutLen ;
2618 if ( buf && res < n)
2619 {
2620 buf[res] = 0;
2621
2622 //we need to double-trip to verify it didn't insert any ? in place
2623 //of bogus characters
2624 wxWCharBuffer wcBuf(n);
2625 size_t pszlen = wxWcslen(psz);
2626 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2627 wxWcslen(wcBuf) != pszlen ||
2628 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2629 {
2630 // we didn't obtain the same thing we started from, hence
2631 // the conversion was lossy and we consider that it failed
2632 return (size_t)-1;
2633 }
2634 }
2635
2636 return res ;
2637 }
2638
2639 bool IsOk() const
2640 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2641
2642 private:
2643 TECObjectRef m_MB2WC_converter ;
2644 TECObjectRef m_WC2MB_converter ;
2645
2646 TextEncodingBase m_char_encoding ;
2647 TextEncodingBase m_unicode_encoding ;
2648 };
2649
2650 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2651
2652 // ============================================================================
2653 // wxEncodingConverter based conversion classes
2654 // ============================================================================
2655
2656 #if wxUSE_FONTMAP
2657
2658 class wxMBConv_wxwin : public wxMBConv
2659 {
2660 private:
2661 void Init()
2662 {
2663 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2664 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2665 }
2666
2667 public:
2668 // temporarily just use wxEncodingConverter stuff,
2669 // so that it works while a better implementation is built
2670 wxMBConv_wxwin(const wxChar* name)
2671 {
2672 if (name)
2673 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2674 else
2675 m_enc = wxFONTENCODING_SYSTEM;
2676
2677 Init();
2678 }
2679
2680 wxMBConv_wxwin(wxFontEncoding enc)
2681 {
2682 m_enc = enc;
2683
2684 Init();
2685 }
2686
2687 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2688 {
2689 size_t inbuf = strlen(psz);
2690 if (buf)
2691 {
2692 if (!m2w.Convert(psz,buf))
2693 return (size_t)-1;
2694 }
2695 return inbuf;
2696 }
2697
2698 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2699 {
2700 const size_t inbuf = wxWcslen(psz);
2701 if (buf)
2702 {
2703 if (!w2m.Convert(psz,buf))
2704 return (size_t)-1;
2705 }
2706
2707 return inbuf;
2708 }
2709
2710 virtual size_t GetMBNulLen() const
2711 {
2712 switch ( m_enc )
2713 {
2714 case wxFONTENCODING_UTF16BE:
2715 case wxFONTENCODING_UTF16LE:
2716 return 2;
2717
2718 case wxFONTENCODING_UTF32BE:
2719 case wxFONTENCODING_UTF32LE:
2720 return 4;
2721
2722 default:
2723 return 1;
2724 }
2725 }
2726
2727 bool IsOk() const { return m_ok; }
2728
2729 public:
2730 wxFontEncoding m_enc;
2731 wxEncodingConverter m2w, w2m;
2732
2733 private:
2734 // were we initialized successfully?
2735 bool m_ok;
2736
2737 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2738 };
2739
2740 // make the constructors available for unit testing
2741 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2742 {
2743 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2744 if ( !result->IsOk() )
2745 {
2746 delete result;
2747 return 0;
2748 }
2749 return result;
2750 }
2751
2752 #endif // wxUSE_FONTMAP
2753
2754 // ============================================================================
2755 // wxCSConv implementation
2756 // ============================================================================
2757
2758 void wxCSConv::Init()
2759 {
2760 m_name = NULL;
2761 m_convReal = NULL;
2762 m_deferred = true;
2763 }
2764
2765 wxCSConv::wxCSConv(const wxChar *charset)
2766 {
2767 Init();
2768
2769 if ( charset )
2770 {
2771 SetName(charset);
2772 }
2773
2774 #if wxUSE_FONTMAP
2775 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2776 #else
2777 m_encoding = wxFONTENCODING_SYSTEM;
2778 #endif
2779 }
2780
2781 wxCSConv::wxCSConv(wxFontEncoding encoding)
2782 {
2783 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2784 {
2785 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2786
2787 encoding = wxFONTENCODING_SYSTEM;
2788 }
2789
2790 Init();
2791
2792 m_encoding = encoding;
2793 }
2794
2795 wxCSConv::~wxCSConv()
2796 {
2797 Clear();
2798 }
2799
2800 wxCSConv::wxCSConv(const wxCSConv& conv)
2801 : wxMBConv()
2802 {
2803 Init();
2804
2805 SetName(conv.m_name);
2806 m_encoding = conv.m_encoding;
2807 }
2808
2809 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2810 {
2811 Clear();
2812
2813 SetName(conv.m_name);
2814 m_encoding = conv.m_encoding;
2815
2816 return *this;
2817 }
2818
2819 void wxCSConv::Clear()
2820 {
2821 free(m_name);
2822 delete m_convReal;
2823
2824 m_name = NULL;
2825 m_convReal = NULL;
2826 }
2827
2828 void wxCSConv::SetName(const wxChar *charset)
2829 {
2830 if (charset)
2831 {
2832 m_name = wxStrdup(charset);
2833 m_deferred = true;
2834 }
2835 }
2836
2837 #if wxUSE_FONTMAP
2838 #include "wx/hashmap.h"
2839
// hash map type associating a charset name with a font encoding
2840 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2841 wxEncodingNameCache );
2842
// cache used by wxCSConv::DoCreate(): for each encoding already tried it holds
// the name for which an iconv converter could be created or an empty string
// if none of the names worked
2843 static wxEncodingNameCache gs_nameCache;
2844 #endif
2845
2846 wxMBConv *wxCSConv::DoCreate() const
2847 {
2848 #if wxUSE_FONTMAP
2849 wxLogTrace(TRACE_STRCONV,
2850 wxT("creating conversion for %s"),
2851 (m_name ? m_name
2852 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2853 #endif // wxUSE_FONTMAP
2854
2855 // check for the special case of ASCII or ISO8859-1 charset: as we have
2856 // special knowledge of it anyhow, we don't need to create a special
2857 // conversion object
2858 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2859 m_encoding == wxFONTENCODING_DEFAULT )
2860 {
2861 // don't convert at all
2862 return NULL;
2863 }
2864
2865 // we trust OS to do conversion better than we can so try external
2866 // conversion methods first
2867 //
2868 // the full order is:
2869 // 1. OS conversion (iconv() under Unix or Win32 API)
2870 // 2. hard coded conversions for UTF
2871 // 3. wxEncodingConverter as fall back
2872
2873 // step (1)
2874 #ifdef HAVE_ICONV
2875 #if !wxUSE_FONTMAP
2876 if ( m_name )
2877 #endif // !wxUSE_FONTMAP
2878 {
2879 wxString name(m_name);
2880 wxFontEncoding encoding(m_encoding);
2881
2882 if ( !name.empty() )
2883 {
2884 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2885 if ( conv->IsOk() )
2886 return conv;
2887
2888 delete conv;
2889
2890 #if wxUSE_FONTMAP
2891 encoding =
2892 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2893 #endif // wxUSE_FONTMAP
2894 }
2895 #if wxUSE_FONTMAP
2896 {
2897 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2898 if ( it != gs_nameCache.end() )
2899 {
2900 if ( it->second.empty() )
2901 return NULL;
2902
2903 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2904 if ( conv->IsOk() )
2905 return conv;
2906
2907 delete conv;
2908 }
2909
2910 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2911
2912 for ( ; *names; ++names )
2913 {
2914 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2915 if ( conv->IsOk() )
2916 {
2917 gs_nameCache[encoding] = *names;
2918 return conv;
2919 }
2920
2921 delete conv;
2922 }
2923
2924 gs_nameCache[encoding] = _T(""); // cache the failure
2925 }
2926 #endif // wxUSE_FONTMAP
2927 }
2928 #endif // HAVE_ICONV
2929
2930 #ifdef wxHAVE_WIN32_MB2WC
2931 {
2932 #if wxUSE_FONTMAP
2933 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2934 : new wxMBConv_win32(m_encoding);
2935 if ( conv->IsOk() )
2936 return conv;
2937
2938 delete conv;
2939 #else
2940 return NULL;
2941 #endif
2942 }
2943 #endif // wxHAVE_WIN32_MB2WC
2944 #if defined(__WXMAC__)
2945 {
2946 // leave UTF16 and UTF32 to the built-ins of wx
2947 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2948 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2949 {
2950
2951 #if wxUSE_FONTMAP
2952 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2953 : new wxMBConv_mac(m_encoding);
2954 #else
2955 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2956 #endif
2957 if ( conv->IsOk() )
2958 return conv;
2959
2960 delete conv;
2961 }
2962 }
2963 #endif
2964 #if defined(__WXCOCOA__)
2965 {
2966 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2967 {
2968
2969 #if wxUSE_FONTMAP
2970 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2971 : new wxMBConv_cocoa(m_encoding);
2972 #else
2973 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2974 #endif
2975 if ( conv->IsOk() )
2976 return conv;
2977
2978 delete conv;
2979 }
2980 }
2981 #endif
2982 // step (2)
2983 wxFontEncoding enc = m_encoding;
2984 #if wxUSE_FONTMAP
2985 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2986 {
2987 // use "false" to suppress interactive dialogs -- we can be called from
2988 // anywhere and popping up a dialog from here is the last thing we want to
2989 // do
2990 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2991 }
2992 #endif // wxUSE_FONTMAP
2993
2994 switch ( enc )
2995 {
2996 case wxFONTENCODING_UTF7:
2997 return new wxMBConvUTF7;
2998
2999 case wxFONTENCODING_UTF8:
3000 return new wxMBConvUTF8;
3001
3002 case wxFONTENCODING_UTF16BE:
3003 return new wxMBConvUTF16BE;
3004
3005 case wxFONTENCODING_UTF16LE:
3006 return new wxMBConvUTF16LE;
3007
3008 case wxFONTENCODING_UTF32BE:
3009 return new wxMBConvUTF32BE;
3010
3011 case wxFONTENCODING_UTF32LE:
3012 return new wxMBConvUTF32LE;
3013
3014 default:
3015 // nothing to do but put here to suppress gcc warnings
3016 ;
3017 }
3018
3019 // step (3)
3020 #if wxUSE_FONTMAP
3021 {
3022 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3023 : new wxMBConv_wxwin(m_encoding);
3024 if ( conv->IsOk() )
3025 return conv;
3026
3027 delete conv;
3028 }
3029 #endif // wxUSE_FONTMAP
3030
3031 // NB: This is a hack to prevent deadlock. What could otherwise happen
3032 // in Unicode build: wxConvLocal creation ends up being here
3033 // because of some failure and logs the error. But wxLog will try to
3034 // attach timestamp, for which it will need wxConvLocal (to convert
3035 // time to char* and then wchar_t*), but that fails, tries to log
3036 // error, but wxLog has a (already locked) critical section that
3037 // guards static buffer.
3038 static bool alreadyLoggingError = false;
3039 if (!alreadyLoggingError)
3040 {
3041 alreadyLoggingError = true;
3042 wxLogError(_("Cannot convert from the charset '%s'!"),
3043 m_name ? m_name
3044 :
3045 #if wxUSE_FONTMAP
3046 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3047 #else // !wxUSE_FONTMAP
3048 wxString::Format(_("encoding %s"), m_encoding).c_str()
3049 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3050 );
3051 alreadyLoggingError = false;
3052 }
3053
3054 return NULL;
3055 }
3056
3057 void wxCSConv::CreateConvIfNeeded() const
3058 {
3059 if ( m_deferred )
3060 {
3061 wxCSConv *self = (wxCSConv *)this; // const_cast
3062
3063 #if wxUSE_INTL
3064 // if we don't have neither the name nor the encoding, use the default
3065 // encoding for this system
3066 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3067 {
3068 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3069 }
3070 #endif // wxUSE_INTL
3071
3072 self->m_convReal = DoCreate();
3073 self->m_deferred = false;
3074 }
3075 }
3076
3077 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3078 {
3079 CreateConvIfNeeded();
3080
3081 if (m_convReal)
3082 return m_convReal->MB2WC(buf, psz, n);
3083
3084 // latin-1 (direct)
3085 size_t len = strlen(psz);
3086
3087 if (buf)
3088 {
3089 for (size_t c = 0; c <= len; c++)
3090 buf[c] = (unsigned char)(psz[c]);
3091 }
3092
3093 return len;
3094 }
3095
3096 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3097 {
3098 CreateConvIfNeeded();
3099
3100 if (m_convReal)
3101 return m_convReal->WC2MB(buf, psz, n);
3102
3103 // latin-1 (direct)
3104 const size_t len = wxWcslen(psz);
3105 if (buf)
3106 {
3107 for (size_t c = 0; c <= len; c++)
3108 {
3109 if (psz[c] > 0xFF)
3110 return (size_t)-1;
3111 buf[c] = (char)psz[c];
3112 }
3113 }
3114 else
3115 {
3116 for (size_t c = 0; c <= len; c++)
3117 {
3118 if (psz[c] > 0xFF)
3119 return (size_t)-1;
3120 }
3121 }
3122
3123 return len;
3124 }
3125
3126 size_t wxCSConv::GetMBNulLen() const
3127 {
3128 CreateConvIfNeeded();
3129
3130 if ( m_convReal )
3131 {
3132 return m_convReal->GetMBNulLen();
3133 }
3134
3135 return 1;
3136 }
3137
3138 // ----------------------------------------------------------------------------
3139 // globals
3140 // ----------------------------------------------------------------------------
3141
// the object behind wxConvLibc: its class depends on the platform
3142 #ifdef __WINDOWS__
3143 static wxMBConv_win32 wxConvLibcObj;
3144 #elif defined(__WXMAC__) && !defined(__MACH__)
3145 static wxMBConv_mac wxConvLibcObj ;
3146 #else
3147 static wxMBConvLibc wxConvLibcObj;
3148 #endif
3149
// the objects behind the other predefined converters
3150 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3151 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3152 static wxMBConvUTF7 wxConvUTF7Obj;
3153 static wxMBConvUTF8 wxConvUTF8Obj;
3154
// the exported converters are references to the static objects above
3155 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3156 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3157 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3158 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3159 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
// wxConvCurrent initially points to the libc converter
3160 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
// file names use the UTF-8 converter if __WXOSX__ is defined and the libc
// one otherwise
3161 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3162 #ifdef __WXOSX__
3163 wxConvUTF8Obj;
3164 #else
3165 wxConvLibcObj;
3166 #endif
3167
3168
3169 #else // !wxUSE_WCHAR_T
3170
3171 // stand-ins in absence of wchar_t
// in this build the converters are plain wxMBConv objects and not references
// to objects of more specific classes
3172 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3173 wxConvISO8859_1,
3174 wxConvLocal,
3175 wxConvUTF8;
3176
3177 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T