]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
added wxMBConv::Clone() to be able to copy conversion objects polymorphically
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
25
26 #ifdef __BORLANDC__
27 #pragma hdrstop
28 #endif
29
30 #ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33 #endif // WX_PRECOMP
34
35 #include "wx/strconv.h"
36
37 #if wxUSE_WCHAR_T
38
39 #ifdef __WINDOWS__
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #endif
43
44 #ifndef __WXWINCE__
45 #include <errno.h>
46 #endif
47
48 #include <ctype.h>
49 #include <string.h>
50 #include <stdlib.h>
51
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
55
56 #ifdef __SALFORDC__
57 #include <clib.h>
58 #endif
59
60 #ifdef HAVE_ICONV
61 #include <iconv.h>
62 #include "wx/thread.h"
63 #endif
64
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
67 #include "wx/utils.h"
68
69 #ifdef __WXMAC__
70 #ifndef __DARWIN__
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
74 #endif
75
76 #include "wx/mac/private.h" // includes mac headers
77 #endif
78
79 #define TRACE_STRCONV _T("strconv")
80
81 #if SIZEOF_WCHAR_T == 2
82 #define WC_UTF16
83 #endif
84
85 // ============================================================================
86 // implementation
87 // ============================================================================
88
// Helper of wxMBConv::ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL and false if they are all NUL (in particular,
// false is returned when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
97
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4
100 // ----------------------------------------------------------------------------
101
102
103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
104 {
105 if (input<=0xffff)
106 {
107 if (output)
108 *output = (wxUint16) input;
109 return 1;
110 }
111 else if (input>=0x110000)
112 {
113 return (size_t)-1;
114 }
115 else
116 {
117 if (output)
118 {
119 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
120 *output = (wxUint16) ((input&0x3ff)+0xdc00);
121 }
122 return 2;
123 }
124 }
125
126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
127 {
128 if ((*input<0xd800) || (*input>0xdfff))
129 {
130 output = *input;
131 return 1;
132 }
133 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
134 {
135 output = *input;
136 return (size_t)-1;
137 }
138 else
139 {
140 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
141 return 2;
142 }
143 }
144
145
146 // ----------------------------------------------------------------------------
147 // wxMBConv
148 // ----------------------------------------------------------------------------
149
150 size_t
151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
152 const char *src, size_t srcLen) const
153 {
154 // although new conversion classes are supposed to implement this function
155 // directly, the existins ones only implement the old MB2WC() and so, to
156 // avoid to have to rewrite all conversion classes at once, we provide a
157 // default (but not efficient) implementation of this one in terms of the
158 // old function by copying the input to ensure that it's NUL-terminated and
159 // then using MB2WC() to convert it
160
161 // the number of chars [which would be] written to dst [if it were not NULL]
162 size_t dstWritten = 0;
163
164 // the number of NULs terminating this string
165 size_t nulLen wxDUMMY_INITIALIZE(0);
166
167 // if we were not given the input size we just have to assume that the
168 // string is properly terminated as we have no way of knowing how long it
169 // is anyhow, but if we do have the size check whether there are enough
170 // NULs at the end
171 wxCharBuffer bufTmp;
172 const char *srcEnd;
173 if ( srcLen != (size_t)-1 )
174 {
175 // we need to know how to find the end of this string
176 nulLen = GetMBNulLen();
177 if ( nulLen == wxCONV_FAILED )
178 return wxCONV_FAILED;
179
180 // if there are enough NULs we can avoid the copy
181 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
182 {
183 // make a copy in order to properly NUL-terminate the string
184 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
185 char * const p = bufTmp.data();
186 memcpy(p, src, srcLen);
187 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
188 *s = '\0';
189
190 src = bufTmp;
191 }
192
193 srcEnd = src + srcLen;
194 }
195 else // quit after the first loop iteration
196 {
197 srcEnd = NULL;
198 }
199
200 for ( ;; )
201 {
202 // try to convert the current chunk
203 size_t lenChunk = MB2WC(NULL, src, 0);
204 if ( lenChunk == 0 )
205 {
206 // nothing left in the input string, conversion succeeded; but
207 // still account for the trailing NULL
208 dstWritten++;
209 break;
210 }
211
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
214
215 lenChunk++; // for trailing NUL
216
217 dstWritten += lenChunk;
218
219 if ( dst )
220 {
221 if ( dstWritten > dstLen )
222 return wxCONV_FAILED;
223
224 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
225 return wxCONV_FAILED;
226
227 dst += lenChunk;
228 }
229
230 if ( !srcEnd )
231 {
232 // we convert the entire string in this cas, as we suppose that the
233 // string is NUL-terminated and so srcEnd is not used at all
234 break;
235 }
236
237 // advance the input pointer past the end of this chunk
238 while ( NotAllNULs(src, nulLen) )
239 {
240 // notice that we must skip over multiple bytes here as we suppose
241 // that if NUL takes 2 or 4 bytes, then all the other characters do
242 // too and so if advanced by a single byte we might erroneously
243 // detect sequences of NUL bytes in the middle of the input
244 src += nulLen;
245 }
246
247 src += nulLen; // skipping over its terminator as well
248
249 // note that ">=" (and not just "==") is needed here as the terminator
250 // we skipped just above could be inside or just after the buffer
251 // delimited by inEnd
252 if ( src >= srcEnd )
253 break;
254 }
255
256 return dstWritten;
257 }
258
259 size_t
260 wxMBConv::FromWChar(char *dst, size_t dstLen,
261 const wchar_t *src, size_t srcLen) const
262 {
263 // the number of chars [which would be] written to dst [if it were not NULL]
264 size_t dstWritten = 0;
265
266 // make a copy of the input string unless it is already properly
267 // NUL-terminated
268 //
269 // if we don't know its length we have no choice but to assume that it is,
270 // indeed, properly terminated
271 wxWCharBuffer bufTmp;
272 if ( srcLen == (size_t)-1 )
273 {
274 srcLen = wxWcslen(src) + 1;
275 }
276 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
277 {
278 // make a copy in order to properly NUL-terminate the string
279 bufTmp = wxWCharBuffer(srcLen);
280 memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
281 src = bufTmp;
282 }
283
284 const size_t lenNul = GetMBNulLen();
285 for ( const wchar_t * const srcEnd = src + srcLen;
286 src < srcEnd;
287 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
288 {
289 // try to convert the current chunk
290 size_t lenChunk = WC2MB(NULL, src, 0);
291
292 if ( lenChunk == wxCONV_FAILED )
293 return wxCONV_FAILED;
294
295 lenChunk += lenNul;
296 dstWritten += lenChunk;
297
298 if ( dst )
299 {
300 if ( dstWritten > dstLen )
301 return wxCONV_FAILED;
302
303 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
304 return wxCONV_FAILED;
305
306 dst += lenChunk;
307 }
308 }
309
310 return dstWritten;
311 }
312
313 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
314 {
315 size_t rc = ToWChar(out, outLen, in);
316 if ( rc != wxCONV_FAILED )
317 {
318 // ToWChar() returns the buffer length, i.e. including the trailing
319 // NUL, while this method doesn't take it into account
320 rc--;
321 }
322
323 return rc;
324 }
325
326 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
327 {
328 size_t rc = FromWChar(out, outLen, in);
329 if ( rc != wxCONV_FAILED )
330 {
331 rc -= GetMBNulLen();
332 }
333
334 return rc;
335 }
336
337 wxMBConv::~wxMBConv()
338 {
339 // nothing to do here (necessary for Darwin linking probably)
340 }
341
342 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
343 {
344 if ( psz )
345 {
346 // calculate the length of the buffer needed first
347 const size_t nLen = MB2WC(NULL, psz, 0);
348 if ( nLen != wxCONV_FAILED )
349 {
350 // now do the actual conversion
351 wxWCharBuffer buf(nLen /* +1 added implicitly */);
352
353 // +1 for the trailing NULL
354 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
355 return buf;
356 }
357 }
358
359 return wxWCharBuffer();
360 }
361
362 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
363 {
364 if ( pwz )
365 {
366 const size_t nLen = WC2MB(NULL, pwz, 0);
367 if ( nLen != wxCONV_FAILED )
368 {
369 // extra space for trailing NUL(s)
370 static const size_t extraLen = GetMaxMBNulLen();
371
372 wxCharBuffer buf(nLen + extraLen - 1);
373 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
374 return buf;
375 }
376 }
377
378 return wxCharBuffer();
379 }
380
381 const wxWCharBuffer
382 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
383 {
384 const size_t dstLen = ToWChar(NULL, 0, in, inLen);
385 if ( dstLen != wxCONV_FAILED )
386 {
387 wxWCharBuffer wbuf(dstLen - 1);
388 if ( ToWChar(wbuf.data(), dstLen, in, inLen) )
389 {
390 if ( outLen )
391 *outLen = dstLen - 1;
392 return wbuf;
393 }
394 }
395
396 if ( outLen )
397 *outLen = 0;
398
399 return wxWCharBuffer();
400 }
401
402 const wxCharBuffer
403 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
404 {
405 const size_t dstLen = FromWChar(NULL, 0, in, inLen);
406 if ( dstLen != wxCONV_FAILED )
407 {
408 wxCharBuffer buf(dstLen - 1);
409 if ( FromWChar(buf.data(), dstLen, in, inLen) )
410 {
411 if ( outLen )
412 *outLen = dstLen - 1;
413 return buf;
414 }
415 }
416
417 if ( outLen )
418 *outLen = 0;
419
420 return wxCharBuffer();
421 }
422
423 // ----------------------------------------------------------------------------
424 // wxMBConvLibc
425 // ----------------------------------------------------------------------------
426
427 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
428 {
429 return wxMB2WC(buf, psz, n);
430 }
431
432 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
433 {
434 return wxWC2MB(buf, psz, n);
435 }
436
437 // ----------------------------------------------------------------------------
438 // wxConvBrokenFileNames
439 // ----------------------------------------------------------------------------
440
441 #ifdef __UNIX__
442
443 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
444 {
445 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
446 || wxStricmp(charset, _T("UTF8")) == 0 )
447 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
448 else
449 m_conv = new wxCSConv(charset);
450 }
451
452 #endif // __UNIX__
453
454 // ----------------------------------------------------------------------------
455 // UTF-7
456 // ----------------------------------------------------------------------------
457
458 // Implementation (C) 2004 Fredrik Roubert
459
//
// BASE64 decoding table
//
// Indexed by the (unsigned) value of the input character: contains the 6 bit
// value encoded by the characters of the BASE64 alphabet ('A'-'Z', 'a'-'z',
// '0'-'9', '+' and '/') and 0xff for all the other ones.
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' to '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' to 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' to 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' to 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' to 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: nothing valid here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
498
499 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
500 {
501 size_t len = 0;
502
503 while ( *psz && (!buf || (len < n)) )
504 {
505 unsigned char cc = *psz++;
506 if (cc != '+')
507 {
508 // plain ASCII char
509 if (buf)
510 *buf++ = cc;
511 len++;
512 }
513 else if (*psz == '-')
514 {
515 // encoded plus sign
516 if (buf)
517 *buf++ = cc;
518 len++;
519 psz++;
520 }
521 else // start of BASE64 encoded string
522 {
523 bool lsb, ok;
524 unsigned int d, l;
525 for ( ok = lsb = false, d = 0, l = 0;
526 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
527 psz++ )
528 {
529 d <<= 6;
530 d += cc;
531 for (l += 6; l >= 8; lsb = !lsb)
532 {
533 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
534 if (lsb)
535 {
536 if (buf)
537 *buf++ |= c;
538 len ++;
539 }
540 else
541 {
542 if (buf)
543 *buf = (wchar_t)(c << 8);
544 }
545
546 ok = true;
547 }
548 }
549
550 if ( !ok )
551 {
552 // in valid UTF7 we should have valid characters after '+'
553 return (size_t)-1;
554 }
555
556 if (*psz == '-')
557 psz++;
558 }
559 }
560
561 if ( buf && (len < n) )
562 *buf = '\0';
563
564 return len;
565 }
566
//
// BASE64 encoding table
//
// Maps each 6 bit value to the character of the BASE64 alphabet encoding it.
//
static const unsigned char utf7enb64[] =
{
    // 0 - 25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 - 51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 - 61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62 and 63
    '+', '/'
};
581
//
// UTF-7 encoding table
//
// Indexed by the ASCII character code, the values are:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x00
    3, 2, 2, 3, 3, 2, 3, 3,     // 0x08
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x10
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x18
    2, 1, 1, 1, 1, 1, 1, 0,     // 0x20
    0, 0, 1, 3, 0, 0, 0, 3,     // 0x28
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x30
    0, 0, 0, 1, 1, 1, 1, 0,     // 0x38
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x40
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x48
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x50
    0, 0, 0, 1, 3, 1, 1, 1,     // 0x58
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x60
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x68
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x70
    0, 0, 0, 1, 1, 1, 3, 3      // 0x78
};
601
602 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
603 {
604 size_t len = 0;
605
606 while (*psz && ((!buf) || (len < n)))
607 {
608 wchar_t cc = *psz++;
609 if (cc < 0x80 && utf7encode[cc] < 1)
610 {
611 // plain ASCII char
612 if (buf)
613 *buf++ = (char)cc;
614 len++;
615 }
616 #ifndef WC_UTF16
617 else if (((wxUint32)cc) > 0xffff)
618 {
619 // no surrogate pair generation (yet?)
620 return (size_t)-1;
621 }
622 #endif
623 else
624 {
625 if (buf)
626 *buf++ = '+';
627 len++;
628 if (cc != '+')
629 {
630 // BASE64 encode string
631 unsigned int lsb, d, l;
632 for (d = 0, l = 0; /*nothing*/; psz++)
633 {
634 for (lsb = 0; lsb < 2; lsb ++)
635 {
636 d <<= 8;
637 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
638
639 for (l += 8; l >= 6; )
640 {
641 l -= 6;
642 if (buf)
643 *buf++ = utf7enb64[(d >> l) % 64];
644 len++;
645 }
646 }
647 cc = *psz;
648 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
649 break;
650 }
651 if (l != 0)
652 {
653 if (buf)
654 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
655 len++;
656 }
657 }
658 if (buf)
659 *buf++ = '-';
660 len++;
661 }
662 }
663 if (buf && (len < n))
664 *buf = 0;
665 return len;
666 }
667
668 // ----------------------------------------------------------------------------
669 // UTF-8
670 // ----------------------------------------------------------------------------
671
672 static wxUint32 utf8_max[]=
673 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
674
675 // boundaries of the private use area we use to (temporarily) remap invalid
676 // characters invalid in a UTF-8 encoded string
677 const wxUint32 wxUnicodePUA = 0x100000;
678 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
679
680 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
681 {
682 size_t len = 0;
683
684 while (*psz && ((!buf) || (len < n)))
685 {
686 const char *opsz = psz;
687 bool invalid = false;
688 unsigned char cc = *psz++, fc = cc;
689 unsigned cnt;
690 for (cnt = 0; fc & 0x80; cnt++)
691 fc <<= 1;
692 if (!cnt)
693 {
694 // plain ASCII char
695 if (buf)
696 *buf++ = cc;
697 len++;
698
699 // escape the escape character for octal escapes
700 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
701 && cc == '\\' && (!buf || len < n))
702 {
703 if (buf)
704 *buf++ = cc;
705 len++;
706 }
707 }
708 else
709 {
710 cnt--;
711 if (!cnt)
712 {
713 // invalid UTF-8 sequence
714 invalid = true;
715 }
716 else
717 {
718 unsigned ocnt = cnt - 1;
719 wxUint32 res = cc & (0x3f >> cnt);
720 while (cnt--)
721 {
722 cc = *psz;
723 if ((cc & 0xC0) != 0x80)
724 {
725 // invalid UTF-8 sequence
726 invalid = true;
727 break;
728 }
729 psz++;
730 res = (res << 6) | (cc & 0x3f);
731 }
732 if (invalid || res <= utf8_max[ocnt])
733 {
734 // illegal UTF-8 encoding
735 invalid = true;
736 }
737 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
738 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
739 {
740 // if one of our PUA characters turns up externally
741 // it must also be treated as an illegal sequence
742 // (a bit like you have to escape an escape character)
743 invalid = true;
744 }
745 else
746 {
747 #ifdef WC_UTF16
748 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
749 size_t pa = encode_utf16(res, (wxUint16 *)buf);
750 if (pa == (size_t)-1)
751 {
752 invalid = true;
753 }
754 else
755 {
756 if (buf)
757 buf += pa;
758 len += pa;
759 }
760 #else // !WC_UTF16
761 if (buf)
762 *buf++ = (wchar_t)res;
763 len++;
764 #endif // WC_UTF16/!WC_UTF16
765 }
766 }
767 if (invalid)
768 {
769 if (m_options & MAP_INVALID_UTF8_TO_PUA)
770 {
771 while (opsz < psz && (!buf || len < n))
772 {
773 #ifdef WC_UTF16
774 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
775 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
776 wxASSERT(pa != (size_t)-1);
777 if (buf)
778 buf += pa;
779 opsz++;
780 len += pa;
781 #else
782 if (buf)
783 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
784 opsz++;
785 len++;
786 #endif
787 }
788 }
789 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
790 {
791 while (opsz < psz && (!buf || len < n))
792 {
793 if ( buf && len + 3 < n )
794 {
795 unsigned char on = *opsz;
796 *buf++ = L'\\';
797 *buf++ = (wchar_t)( L'0' + on / 0100 );
798 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
799 *buf++ = (wchar_t)( L'0' + on % 010 );
800 }
801 opsz++;
802 len += 4;
803 }
804 }
805 else // MAP_INVALID_UTF8_NOT
806 {
807 return (size_t)-1;
808 }
809 }
810 }
811 }
812 if (buf && (len < n))
813 *buf = 0;
814 return len;
815 }
816
// returns true if the given wide character is an octal digit, i.e. is in the
// range from L'0' to L'7' inclusive
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
821
822 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
823 {
824 size_t len = 0;
825
826 while (*psz && ((!buf) || (len < n)))
827 {
828 wxUint32 cc;
829 #ifdef WC_UTF16
830 // cast is ok for WC_UTF16
831 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
832 psz += (pa == (size_t)-1) ? 1 : pa;
833 #else
834 cc=(*psz++) & 0x7fffffff;
835 #endif
836
837 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
838 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
839 {
840 if (buf)
841 *buf++ = (char)(cc - wxUnicodePUA);
842 len++;
843 }
844 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
845 && cc == L'\\' && psz[0] == L'\\' )
846 {
847 if (buf)
848 *buf++ = (char)cc;
849 psz++;
850 len++;
851 }
852 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
853 cc == L'\\' &&
854 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
855 {
856 if (buf)
857 {
858 *buf++ = (char) ((psz[0] - L'0')*0100 +
859 (psz[1] - L'0')*010 +
860 (psz[2] - L'0'));
861 }
862
863 psz += 3;
864 len++;
865 }
866 else
867 {
868 unsigned cnt;
869 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
870 if (!cnt)
871 {
872 // plain ASCII char
873 if (buf)
874 *buf++ = (char) cc;
875 len++;
876 }
877
878 else
879 {
880 len += cnt + 1;
881 if (buf)
882 {
883 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
884 while (cnt--)
885 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
886 }
887 }
888 }
889 }
890
891 if (buf && (len<n))
892 *buf = 0;
893
894 return len;
895 }
896
897 // ----------------------------------------------------------------------------
898 // UTF-16
899 // ----------------------------------------------------------------------------
900
901 #ifdef WORDS_BIGENDIAN
902 #define wxMBConvUTF16straight wxMBConvUTF16BE
903 #define wxMBConvUTF16swap wxMBConvUTF16LE
904 #else
905 #define wxMBConvUTF16swap wxMBConvUTF16BE
906 #define wxMBConvUTF16straight wxMBConvUTF16LE
907 #endif
908
909
910 #ifdef WC_UTF16
911
912 // copy 16bit MB to 16bit String
913 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
914 {
915 size_t len=0;
916
917 while (*(wxUint16*)psz && (!buf || len < n))
918 {
919 if (buf)
920 *buf++ = *(wxUint16*)psz;
921 len++;
922
923 psz += sizeof(wxUint16);
924 }
925 if (buf && len<n) *buf=0;
926
927 return len;
928 }
929
930
931 // copy 16bit String to 16bit MB
932 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
933 {
934 size_t len=0;
935
936 while (*psz && (!buf || len < n))
937 {
938 if (buf)
939 {
940 *(wxUint16*)buf = *psz;
941 buf += sizeof(wxUint16);
942 }
943 len += sizeof(wxUint16);
944 psz++;
945 }
946 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
947
948 return len;
949 }
950
951
952 // swap 16bit MB to 16bit String
953 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
954 {
955 size_t len = 0;
956
957 // UTF16 string must be terminated by 2 NULs as single NULs may occur
958 // inside the string
959 while ( (psz[0] || psz[1]) && (!buf || len < n) )
960 {
961 if ( buf )
962 {
963 ((char *)buf)[0] = psz[1];
964 ((char *)buf)[1] = psz[0];
965 buf++;
966 }
967 len++;
968 psz += 2;
969 }
970
971 if ( buf && len < n )
972 *buf = L'\0';
973
974 return len;
975 }
976
977
978 // swap 16bit MB to 16bit String
979 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
980 {
981 size_t len = 0;
982
983 while ( *psz && (!buf || len < n) )
984 {
985 if ( buf )
986 {
987 *buf++ = ((char*)psz)[1];
988 *buf++ = ((char*)psz)[0];
989 }
990 len += 2;
991 psz++;
992 }
993
994 if ( buf && len < n - 1 )
995 {
996 buf[0] =
997 buf[1] = '\0';
998 }
999
1000 return len;
1001 }
1002
1003
1004 #else // WC_UTF16
1005
1006
1007 // copy 16bit MB to 32bit String
1008 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1009 {
1010 size_t len=0;
1011
1012 while (*(wxUint16*)psz && (!buf || len < n))
1013 {
1014 wxUint32 cc;
1015 size_t pa=decode_utf16((wxUint16*)psz, cc);
1016 if (pa == (size_t)-1)
1017 return pa;
1018
1019 if (buf)
1020 *buf++ = (wchar_t)cc;
1021 len++;
1022 psz += pa * sizeof(wxUint16);
1023 }
1024 if (buf && len<n) *buf=0;
1025
1026 return len;
1027 }
1028
1029
1030 // copy 32bit String to 16bit MB
1031 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1032 {
1033 size_t len=0;
1034
1035 while (*psz && (!buf || len < n))
1036 {
1037 wxUint16 cc[2];
1038 size_t pa=encode_utf16(*psz, cc);
1039
1040 if (pa == (size_t)-1)
1041 return pa;
1042
1043 if (buf)
1044 {
1045 *(wxUint16*)buf = cc[0];
1046 buf += sizeof(wxUint16);
1047 if (pa > 1)
1048 {
1049 *(wxUint16*)buf = cc[1];
1050 buf += sizeof(wxUint16);
1051 }
1052 }
1053
1054 len += pa*sizeof(wxUint16);
1055 psz++;
1056 }
1057 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1058
1059 return len;
1060 }
1061
1062
1063 // swap 16bit MB to 32bit String
1064 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1065 {
1066 size_t len=0;
1067
1068 while (*(wxUint16*)psz && (!buf || len < n))
1069 {
1070 wxUint32 cc;
1071 char tmp[4];
1072 tmp[0]=psz[1]; tmp[1]=psz[0];
1073 tmp[2]=psz[3]; tmp[3]=psz[2];
1074
1075 size_t pa=decode_utf16((wxUint16*)tmp, cc);
1076 if (pa == (size_t)-1)
1077 return pa;
1078
1079 if (buf)
1080 *buf++ = (wchar_t)cc;
1081
1082 len++;
1083 psz += pa * sizeof(wxUint16);
1084 }
1085 if (buf && len<n) *buf=0;
1086
1087 return len;
1088 }
1089
1090
1091 // swap 32bit String to 16bit MB
1092 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1093 {
1094 size_t len=0;
1095
1096 while (*psz && (!buf || len < n))
1097 {
1098 wxUint16 cc[2];
1099 size_t pa=encode_utf16(*psz, cc);
1100
1101 if (pa == (size_t)-1)
1102 return pa;
1103
1104 if (buf)
1105 {
1106 *buf++ = ((char*)cc)[1];
1107 *buf++ = ((char*)cc)[0];
1108 if (pa > 1)
1109 {
1110 *buf++ = ((char*)cc)[3];
1111 *buf++ = ((char*)cc)[2];
1112 }
1113 }
1114
1115 len += pa*sizeof(wxUint16);
1116 psz++;
1117 }
1118 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1119
1120 return len;
1121 }
1122
1123 #endif // WC_UTF16
1124
1125
1126 // ----------------------------------------------------------------------------
1127 // UTF-32
1128 // ----------------------------------------------------------------------------
1129
1130 #ifdef WORDS_BIGENDIAN
1131 #define wxMBConvUTF32straight wxMBConvUTF32BE
1132 #define wxMBConvUTF32swap wxMBConvUTF32LE
1133 #else
1134 #define wxMBConvUTF32swap wxMBConvUTF32BE
1135 #define wxMBConvUTF32straight wxMBConvUTF32LE
1136 #endif
1137
1138
// Global UTF-32 conversion objects, one of each of the two byte order
// specific classes.
// NOTE(review): WXDLLIMPEXP_DATA_BASE() is defined outside of this file and
// presumably just adds the DLL import/export decoration -- confirm.
1139 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1140 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1141
1142
1143 #ifdef WC_UTF16
1144
1145 // copy 32bit MB to 16bit String
1146 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1147 {
1148 size_t len=0;
1149
1150 while (*(wxUint32*)psz && (!buf || len < n))
1151 {
1152 wxUint16 cc[2];
1153
1154 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1155 if (pa == (size_t)-1)
1156 return pa;
1157
1158 if (buf)
1159 {
1160 *buf++ = cc[0];
1161 if (pa > 1)
1162 *buf++ = cc[1];
1163 }
1164 len += pa;
1165 psz += sizeof(wxUint32);
1166 }
1167 if (buf && len<n) *buf=0;
1168
1169 return len;
1170 }
1171
1172
1173 // copy 16bit String to 32bit MB
1174 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1175 {
1176 size_t len=0;
1177
1178 while (*psz && (!buf || len < n))
1179 {
1180 wxUint32 cc;
1181
1182 // cast is ok for WC_UTF16
1183 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1184 if (pa == (size_t)-1)
1185 return pa;
1186
1187 if (buf)
1188 {
1189 *(wxUint32*)buf = cc;
1190 buf += sizeof(wxUint32);
1191 }
1192 len += sizeof(wxUint32);
1193 psz += pa;
1194 }
1195
1196 if (buf && len<=n-sizeof(wxUint32))
1197 *(wxUint32*)buf=0;
1198
1199 return len;
1200 }
1201
1202
1203
1204 // swap 32bit MB to 16bit String
1205 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1206 {
1207 size_t len=0;
1208
1209 while (*(wxUint32*)psz && (!buf || len < n))
1210 {
1211 char tmp[4];
1212 tmp[0] = psz[3]; tmp[1] = psz[2];
1213 tmp[2] = psz[1]; tmp[3] = psz[0];
1214
1215
1216 wxUint16 cc[2];
1217
1218 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1219 if (pa == (size_t)-1)
1220 return pa;
1221
1222 if (buf)
1223 {
1224 *buf++ = cc[0];
1225 if (pa > 1)
1226 *buf++ = cc[1];
1227 }
1228 len += pa;
1229 psz += sizeof(wxUint32);
1230 }
1231
1232 if (buf && len<n)
1233 *buf=0;
1234
1235 return len;
1236 }
1237
1238
1239 // swap 16bit String to 32bit MB
1240 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1241 {
1242 size_t len=0;
1243
1244 while (*psz && (!buf || len < n))
1245 {
1246 char cc[4];
1247
1248 // cast is ok for WC_UTF16
1249 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1250 if (pa == (size_t)-1)
1251 return pa;
1252
1253 if (buf)
1254 {
1255 *buf++ = cc[3];
1256 *buf++ = cc[2];
1257 *buf++ = cc[1];
1258 *buf++ = cc[0];
1259 }
1260 len += sizeof(wxUint32);
1261 psz += pa;
1262 }
1263
1264 if (buf && len<=n-sizeof(wxUint32))
1265 *(wxUint32*)buf=0;
1266
1267 return len;
1268 }
1269
1270 #else // WC_UTF16
1271
1272
1273 // copy 32bit MB to 32bit String
1274 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1275 {
1276 size_t len=0;
1277
1278 while (*(wxUint32*)psz && (!buf || len < n))
1279 {
1280 if (buf)
1281 *buf++ = (wchar_t)(*(wxUint32*)psz);
1282 len++;
1283 psz += sizeof(wxUint32);
1284 }
1285
1286 if (buf && len<n)
1287 *buf=0;
1288
1289 return len;
1290 }
1291
1292
1293 // copy 32bit String to 32bit MB
1294 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1295 {
1296 size_t len=0;
1297
1298 while (*psz && (!buf || len < n))
1299 {
1300 if (buf)
1301 {
1302 *(wxUint32*)buf = *psz;
1303 buf += sizeof(wxUint32);
1304 }
1305
1306 len += sizeof(wxUint32);
1307 psz++;
1308 }
1309
1310 if (buf && len<=n-sizeof(wxUint32))
1311 *(wxUint32*)buf=0;
1312
1313 return len;
1314 }
1315
1316
1317 // swap 32bit MB to 32bit String
1318 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1319 {
1320 size_t len=0;
1321
1322 while (*(wxUint32*)psz && (!buf || len < n))
1323 {
1324 if (buf)
1325 {
1326 ((char *)buf)[0] = psz[3];
1327 ((char *)buf)[1] = psz[2];
1328 ((char *)buf)[2] = psz[1];
1329 ((char *)buf)[3] = psz[0];
1330 buf++;
1331 }
1332 len++;
1333 psz += sizeof(wxUint32);
1334 }
1335
1336 if (buf && len<n)
1337 *buf=0;
1338
1339 return len;
1340 }
1341
1342
1343 // swap 32bit String to 32bit MB
1344 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1345 {
1346 size_t len=0;
1347
1348 while (*psz && (!buf || len < n))
1349 {
1350 if (buf)
1351 {
1352 *buf++ = ((char *)psz)[3];
1353 *buf++ = ((char *)psz)[2];
1354 *buf++ = ((char *)psz)[1];
1355 *buf++ = ((char *)psz)[0];
1356 }
1357 len += sizeof(wxUint32);
1358 psz++;
1359 }
1360
1361 if (buf && len<=n-sizeof(wxUint32))
1362 *(wxUint32*)buf=0;
1363
1364 return len;
1365 }
1366
1367
1368 #endif // WC_UTF16
1369
1370
1371 // ============================================================================
1372 // The classes doing conversion using the iconv_xxx() functions
1373 // ============================================================================
1374
1375 #ifdef HAVE_ICONV
1376
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// NB: the macro arguments are parenthesized so that passing an expression
//     (e.g. "a & b") doesn't silently change the meaning of the comparison
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the input pointer argument to whatever type this iconv() expects
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value returned by iconv_open() on failure
#define ICONV_T_INVALID ((iconv_t)-1)
1395
1396 #if SIZEOF_WCHAR_T == 4
1397 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1398 #define WC_ENC wxFONTENCODING_UTF32
1399 #elif SIZEOF_WCHAR_T == 2
1400 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1401 #define WC_ENC wxFONTENCODING_UTF16
1402 #else // sizeof(wchar_t) != 2 nor 4
1403 // does this ever happen?
1404 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1405 #endif
1406
1407 // ----------------------------------------------------------------------------
1408 // wxMBConv_iconv: encapsulates an iconv character set
1409 // ----------------------------------------------------------------------------
1410
1411 class wxMBConv_iconv : public wxMBConv
1412 {
1413 public:
1414 wxMBConv_iconv(const wxChar *name);
1415 virtual ~wxMBConv_iconv();
1416
1417 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1418 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1419
1420 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1421 virtual size_t GetMBNulLen() const;
1422
1423 virtual wxMBConv *Clone() const
1424 {
1425 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1426 p->m_minMBCharWidth = m_minMBCharWidth;
1427 return p;
1428 }
1429
1430 bool IsOk() const
1431 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1432
1433 protected:
1434 // the iconv handlers used to translate from multibyte to wide char and in
1435 // the other direction
1436 iconv_t m2w,
1437 w2m;
1438 #if wxUSE_THREADS
1439 // guards access to m2w and w2m objects
1440 wxMutex m_iconvMutex;
1441 #endif
1442
1443 private:
1444 // the name (for iconv_open()) of a wide char charset -- if none is
1445 // available on this machine, it will remain NULL
1446 static wxString ms_wcCharsetName;
1447
1448 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1449 // different endian-ness than the native one
1450 static bool ms_wcNeedsSwap;
1451
1452
1453 // name of the encoding handled by this conversion
1454 wxString m_name;
1455
1456 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1457 // initially
1458 size_t m_minMBCharWidth;
1459 };
1460
1461 // make the constructor available for unit testing
1462 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1463 {
1464 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1465 if ( !result->IsOk() )
1466 {
1467 delete result;
1468 return 0;
1469 }
1470 return result;
1471 }
1472
1473 wxString wxMBConv_iconv::ms_wcCharsetName;
1474 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1475
1476 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1477 : m_name(name)
1478 {
1479 m_minMBCharWidth = 0;
1480
1481 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1482 // names for the charsets
1483 const wxCharBuffer cname(wxString(name).ToAscii());
1484
1485 // check for charset that represents wchar_t:
1486 if ( ms_wcCharsetName.empty() )
1487 {
1488 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1489
1490 #if wxUSE_FONTMAP
1491 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1492 #else // !wxUSE_FONTMAP
1493 static const wxChar *names[] =
1494 {
1495 #if SIZEOF_WCHAR_T == 4
1496 _T("UCS-4"),
1497 #elif SIZEOF_WCHAR_T = 2
1498 _T("UCS-2"),
1499 #endif
1500 NULL
1501 };
1502 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1503
1504 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1505 {
1506 const wxString nameCS(*names);
1507
1508 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1509 wxString nameXE(nameCS);
1510 #ifdef WORDS_BIGENDIAN
1511 nameXE += _T("BE");
1512 #else // little endian
1513 nameXE += _T("LE");
1514 #endif
1515
1516 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1517 nameXE.c_str());
1518
1519 m2w = iconv_open(nameXE.ToAscii(), cname);
1520 if ( m2w == ICONV_T_INVALID )
1521 {
1522 // try charset w/o bytesex info (e.g. "UCS4")
1523 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1524 nameCS.c_str());
1525 m2w = iconv_open(nameCS.ToAscii(), cname);
1526
1527 // and check for bytesex ourselves:
1528 if ( m2w != ICONV_T_INVALID )
1529 {
1530 char buf[2], *bufPtr;
1531 wchar_t wbuf[2], *wbufPtr;
1532 size_t insz, outsz;
1533 size_t res;
1534
1535 buf[0] = 'A';
1536 buf[1] = 0;
1537 wbuf[0] = 0;
1538 insz = 2;
1539 outsz = SIZEOF_WCHAR_T * 2;
1540 wbufPtr = wbuf;
1541 bufPtr = buf;
1542
1543 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1544 (char**)&wbufPtr, &outsz);
1545
1546 if (ICONV_FAILED(res, insz))
1547 {
1548 wxLogLastError(wxT("iconv"));
1549 wxLogError(_("Conversion to charset '%s' doesn't work."),
1550 nameCS.c_str());
1551 }
1552 else // ok, can convert to this encoding, remember it
1553 {
1554 ms_wcCharsetName = nameCS;
1555 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1556 }
1557 }
1558 }
1559 else // use charset not requiring byte swapping
1560 {
1561 ms_wcCharsetName = nameXE;
1562 }
1563 }
1564
1565 wxLogTrace(TRACE_STRCONV,
1566 wxT("iconv wchar_t charset is \"%s\"%s"),
1567 ms_wcCharsetName.empty() ? _T("<none>")
1568 : ms_wcCharsetName.c_str(),
1569 ms_wcNeedsSwap ? _T(" (needs swap)")
1570 : _T(""));
1571 }
1572 else // we already have ms_wcCharsetName
1573 {
1574 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1575 }
1576
1577 if ( ms_wcCharsetName.empty() )
1578 {
1579 w2m = ICONV_T_INVALID;
1580 }
1581 else
1582 {
1583 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1584 if ( w2m == ICONV_T_INVALID )
1585 {
1586 wxLogTrace(TRACE_STRCONV,
1587 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1588 ms_wcCharsetName.c_str(), cname.data());
1589 }
1590 }
1591 }
1592
1593 wxMBConv_iconv::~wxMBConv_iconv()
1594 {
1595 if ( m2w != ICONV_T_INVALID )
1596 iconv_close(m2w);
1597 if ( w2m != ICONV_T_INVALID )
1598 iconv_close(w2m);
1599 }
1600
1601 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1602 {
1603 // find the string length: notice that must be done differently for
1604 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1605 size_t inbuf;
1606 const size_t nulLen = GetMBNulLen();
1607 switch ( nulLen )
1608 {
1609 default:
1610 return (size_t)-1;
1611
1612 case 1:
1613 inbuf = strlen(psz); // arguably more optimized than our version
1614 break;
1615
1616 case 2:
1617 case 4:
1618 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1619 // they also have to start at character boundary and not span two
1620 // adjacent characters
1621 const char *p;
1622 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1623 ;
1624 inbuf = p - psz;
1625 break;
1626 }
1627
1628 #if wxUSE_THREADS
1629 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1630 // Unfortunately there is a couple of global wxCSConv objects such as
1631 // wxConvLocal that are used all over wx code, so we have to make sure
1632 // the handle is used by at most one thread at the time. Otherwise
1633 // only a few wx classes would be safe to use from non-main threads
1634 // as MB<->WC conversion would fail "randomly".
1635 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1636 #endif // wxUSE_THREADS
1637
1638
1639 size_t outbuf = n * SIZEOF_WCHAR_T;
1640 size_t res, cres;
1641 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1642 wchar_t *bufPtr = buf;
1643 const char *pszPtr = psz;
1644
1645 if (buf)
1646 {
1647 // have destination buffer, convert there
1648 cres = iconv(m2w,
1649 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1650 (char**)&bufPtr, &outbuf);
1651 res = n - (outbuf / SIZEOF_WCHAR_T);
1652
1653 if (ms_wcNeedsSwap)
1654 {
1655 // convert to native endianness
1656 for ( unsigned i = 0; i < res; i++ )
1657 buf[n] = WC_BSWAP(buf[i]);
1658 }
1659
1660 // NUL-terminate the string if there is any space left
1661 if (res < n)
1662 buf[res] = 0;
1663 }
1664 else
1665 {
1666 // no destination buffer... convert using temp buffer
1667 // to calculate destination buffer requirement
1668 wchar_t tbuf[8];
1669 res = 0;
1670 do {
1671 bufPtr = tbuf;
1672 outbuf = 8*SIZEOF_WCHAR_T;
1673
1674 cres = iconv(m2w,
1675 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1676 (char**)&bufPtr, &outbuf );
1677
1678 res += 8-(outbuf/SIZEOF_WCHAR_T);
1679 } while ((cres==(size_t)-1) && (errno==E2BIG));
1680 }
1681
1682 if (ICONV_FAILED(cres, inbuf))
1683 {
1684 //VS: it is ok if iconv fails, hence trace only
1685 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1686 return (size_t)-1;
1687 }
1688
1689 return res;
1690 }
1691
1692 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1693 {
1694 #if wxUSE_THREADS
1695 // NB: explained in MB2WC
1696 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1697 #endif
1698
1699 size_t inlen = wxWcslen(psz);
1700 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1701 size_t outbuf = n;
1702 size_t res, cres;
1703
1704 wchar_t *tmpbuf = 0;
1705
1706 if (ms_wcNeedsSwap)
1707 {
1708 // need to copy to temp buffer to switch endianness
1709 // (doing WC_BSWAP twice on the original buffer won't help, as it
1710 // could be in read-only memory, or be accessed in some other thread)
1711 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1712 for ( size_t i = 0; i < inlen; i++ )
1713 tmpbuf[n] = WC_BSWAP(psz[i]);
1714 tmpbuf[inlen] = L'\0';
1715 psz = tmpbuf;
1716 }
1717
1718 if (buf)
1719 {
1720 // have destination buffer, convert there
1721 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1722
1723 res = n-outbuf;
1724
1725 // NB: iconv was given only wcslen(psz) characters on input, and so
1726 // it couldn't convert the trailing zero. Let's do it ourselves
1727 // if there's some room left for it in the output buffer.
1728 if (res < n)
1729 buf[0] = 0;
1730 }
1731 else
1732 {
1733 // no destination buffer... convert using temp buffer
1734 // to calculate destination buffer requirement
1735 char tbuf[16];
1736 res = 0;
1737 do {
1738 buf = tbuf; outbuf = 16;
1739
1740 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1741
1742 res += 16 - outbuf;
1743 } while ((cres==(size_t)-1) && (errno==E2BIG));
1744 }
1745
1746 if (ms_wcNeedsSwap)
1747 {
1748 free(tmpbuf);
1749 }
1750
1751 if (ICONV_FAILED(cres, inbuf))
1752 {
1753 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1754 return (size_t)-1;
1755 }
1756
1757 return res;
1758 }
1759
1760 size_t wxMBConv_iconv::GetMBNulLen() const
1761 {
1762 if ( m_minMBCharWidth == 0 )
1763 {
1764 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1765
1766 #if wxUSE_THREADS
1767 // NB: explained in MB2WC
1768 wxMutexLocker lock(self->m_iconvMutex);
1769 #endif
1770
1771 wchar_t *wnul = L"";
1772 char buf[8]; // should be enough for NUL in any encoding
1773 size_t inLen = sizeof(wchar_t),
1774 outLen = WXSIZEOF(buf);
1775 char *in = (char *)wnul;
1776 char *out = buf;
1777 if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1778 {
1779 self->m_minMBCharWidth = (size_t)-1;
1780 }
1781 else // ok
1782 {
1783 self->m_minMBCharWidth = out - buf;
1784 }
1785 }
1786
1787 return m_minMBCharWidth;
1788 }
1789
1790 #endif // HAVE_ICONV
1791
1792
1793 // ============================================================================
1794 // Win32 conversion classes
1795 // ============================================================================
1796
1797 #ifdef wxHAVE_WIN32_MB2WC
1798
1799 // from utils.cpp
1800 #if wxUSE_FONTMAP
1801 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1802 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1803 #endif
1804
1805 class wxMBConv_win32 : public wxMBConv
1806 {
1807 public:
1808 wxMBConv_win32()
1809 {
1810 m_CodePage = CP_ACP;
1811 m_minMBCharWidth = 0;
1812 }
1813
1814 wxMBConv_win32(const wxMBConv_win32& conv)
1815 {
1816 m_CodePage = conv.m_CodePage;
1817 m_minMBCharWidth = conv.m_minMBCharWidth;
1818 }
1819
1820 #if wxUSE_FONTMAP
1821 wxMBConv_win32(const wxChar* name)
1822 {
1823 m_CodePage = wxCharsetToCodepage(name);
1824 m_minMBCharWidth = 0;
1825 }
1826
1827 wxMBConv_win32(wxFontEncoding encoding)
1828 {
1829 m_CodePage = wxEncodingToCodepage(encoding);
1830 m_minMBCharWidth = 0;
1831 }
1832 #endif // wxUSE_FONTMAP
1833
1834 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1835 {
1836 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1837 // the behaviour is not compatible with the Unix version (using iconv)
1838 // and break the library itself, e.g. wxTextInputStream::NextChar()
1839 // wouldn't work if reading an incomplete MB char didn't result in an
1840 // error
1841 //
1842 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1843 // Win XP or newer and it is not supported for UTF-[78] so we always
1844 // use our own conversions in this case. See
1845 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1846 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1847 if ( m_CodePage == CP_UTF8 )
1848 {
1849 return wxConvUTF8.MB2WC(buf, psz, n);
1850 }
1851
1852 if ( m_CodePage == CP_UTF7 )
1853 {
1854 return wxConvUTF7.MB2WC(buf, psz, n);
1855 }
1856
1857 int flags = 0;
1858 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
1859 IsAtLeastWin2kSP4() )
1860 {
1861 flags = MB_ERR_INVALID_CHARS;
1862 }
1863
1864 const size_t len = ::MultiByteToWideChar
1865 (
1866 m_CodePage, // code page
1867 flags, // flags: fall on error
1868 psz, // input string
1869 -1, // its length (NUL-terminated)
1870 buf, // output string
1871 buf ? n : 0 // size of output buffer
1872 );
1873 if ( !len )
1874 {
1875 // function totally failed
1876 return (size_t)-1;
1877 }
1878
1879 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1880 // check if we succeeded, by doing a double trip:
1881 if ( !flags && buf )
1882 {
1883 const size_t mbLen = strlen(psz);
1884 wxCharBuffer mbBuf(mbLen);
1885 if ( ::WideCharToMultiByte
1886 (
1887 m_CodePage,
1888 0,
1889 buf,
1890 -1,
1891 mbBuf.data(),
1892 mbLen + 1, // size in bytes, not length
1893 NULL,
1894 NULL
1895 ) == 0 ||
1896 strcmp(mbBuf, psz) != 0 )
1897 {
1898 // we didn't obtain the same thing we started from, hence
1899 // the conversion was lossy and we consider that it failed
1900 return (size_t)-1;
1901 }
1902 }
1903
1904 // note that it returns count of written chars for buf != NULL and size
1905 // of the needed buffer for buf == NULL so in either case the length of
1906 // the string (which never includes the terminating NUL) is one less
1907 return len - 1;
1908 }
1909
1910 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1911 {
1912 /*
1913 we have a problem here: by default, WideCharToMultiByte() may
1914 replace characters unrepresentable in the target code page with bad
1915 quality approximations such as turning "1/2" symbol (U+00BD) into
1916 "1" for the code pages which don't have it and we, obviously, want
1917 to avoid this at any price
1918
1919 the trouble is that this function does it _silently_, i.e. it won't
1920 even tell us whether it did or not... Win98/2000 and higher provide
1921 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1922 we have to resort to a round trip, i.e. check that converting back
1923 results in the same string -- this is, of course, expensive but
1924 otherwise we simply can't be sure to not garble the data.
1925 */
1926
1927 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1928 // it doesn't work with CJK encodings (which we test for rather roughly
1929 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1930 // supporting it
1931 BOOL usedDef wxDUMMY_INITIALIZE(false);
1932 BOOL *pUsedDef;
1933 int flags;
1934 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1935 {
1936 // it's our lucky day
1937 flags = WC_NO_BEST_FIT_CHARS;
1938 pUsedDef = &usedDef;
1939 }
1940 else // old system or unsupported encoding
1941 {
1942 flags = 0;
1943 pUsedDef = NULL;
1944 }
1945
1946 const size_t len = ::WideCharToMultiByte
1947 (
1948 m_CodePage, // code page
1949 flags, // either none or no best fit
1950 pwz, // input string
1951 -1, // it is (wide) NUL-terminated
1952 buf, // output buffer
1953 buf ? n : 0, // and its size
1954 NULL, // default "replacement" char
1955 pUsedDef // [out] was it used?
1956 );
1957
1958 if ( !len )
1959 {
1960 // function totally failed
1961 return (size_t)-1;
1962 }
1963
1964 // if we were really converting, check if we succeeded
1965 if ( buf )
1966 {
1967 if ( flags )
1968 {
1969 // check if the conversion failed, i.e. if any replacements
1970 // were done
1971 if ( usedDef )
1972 return (size_t)-1;
1973 }
1974 else // we must resort to double tripping...
1975 {
1976 wxWCharBuffer wcBuf(n);
1977 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1978 wcscmp(wcBuf, pwz) != 0 )
1979 {
1980 // we didn't obtain the same thing we started from, hence
1981 // the conversion was lossy and we consider that it failed
1982 return (size_t)-1;
1983 }
1984 }
1985 }
1986
1987 // see the comment above for the reason of "len - 1"
1988 return len - 1;
1989 }
1990
1991 virtual size_t GetMBNulLen() const
1992 {
1993 if ( m_minMBCharWidth == 0 )
1994 {
1995 int len = ::WideCharToMultiByte
1996 (
1997 m_CodePage, // code page
1998 0, // no flags
1999 L"", // input string
2000 1, // translate just the NUL
2001 NULL, // output buffer
2002 0, // and its size
2003 NULL, // no replacement char
2004 NULL // [out] don't care if it was used
2005 );
2006
2007 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2008 switch ( len )
2009 {
2010 default:
2011 wxLogDebug(_T("Unexpected NUL length %d"), len);
2012 // fall through
2013
2014 case 0:
2015 self->m_minMBCharWidth = (size_t)-1;
2016 break;
2017
2018 case 1:
2019 case 2:
2020 case 4:
2021 self->m_minMBCharWidth = len;
2022 break;
2023 }
2024 }
2025
2026 return m_minMBCharWidth;
2027 }
2028
2029 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2030
2031 bool IsOk() const { return m_CodePage != -1; }
2032
2033 private:
2034 static bool CanUseNoBestFit()
2035 {
2036 static int s_isWin98Or2k = -1;
2037
2038 if ( s_isWin98Or2k == -1 )
2039 {
2040 int verMaj, verMin;
2041 switch ( wxGetOsVersion(&verMaj, &verMin) )
2042 {
2043 case wxWIN95:
2044 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2045 break;
2046
2047 case wxWINDOWS_NT:
2048 s_isWin98Or2k = verMaj >= 5;
2049 break;
2050
2051 default:
2052 // unknown, be conseravtive by default
2053 s_isWin98Or2k = 0;
2054 }
2055
2056 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2057 }
2058
2059 return s_isWin98Or2k == 1;
2060 }
2061
2062 static bool IsAtLeastWin2kSP4()
2063 {
2064 #ifdef __WXWINCE__
2065 return false;
2066 #else
2067 static int s_isAtLeastWin2kSP4 = -1;
2068
2069 if ( s_isAtLeastWin2kSP4 == -1 )
2070 {
2071 OSVERSIONINFOEX ver;
2072
2073 memset(&ver, 0, sizeof(ver));
2074 ver.dwOSVersionInfoSize = sizeof(ver);
2075 GetVersionEx((OSVERSIONINFO*)&ver);
2076
2077 s_isAtLeastWin2kSP4 =
2078 ((ver.dwMajorVersion > 5) || // Vista+
2079 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2080 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2081 ver.wServicePackMajor >= 4)) // 2000 SP4+
2082 ? 1 : 0;
2083 }
2084
2085 return s_isAtLeastWin2kSP4 == 1;
2086 #endif
2087 }
2088
2089
2090 // the code page we're working with
2091 long m_CodePage;
2092
2093 // cached result of GetMBNulLen(), set to 0 initially meaning
2094 // "unknown"
2095 size_t m_minMBCharWidth;
2096 };
2097
2098 #endif // wxHAVE_WIN32_MB2WC
2099
2100 // ============================================================================
2101 // Cocoa conversion classes
2102 // ============================================================================
2103
2104 #if defined(__WXCOCOA__)
2105
// RN:  There is no UTF-32 support in either Core Foundation or
//      Cocoa. Strangely enough, Core Foundation uses UTF-32
//      internally quite a bit - it's just not public (yet).
2109
2110 #include <CoreFoundation/CFString.h>
2111 #include <CoreFoundation/CFStringEncodingExt.h>
2112
2113 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2114 {
2115 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2116 if ( encoding == wxFONTENCODING_DEFAULT )
2117 {
2118 enc = CFStringGetSystemEncoding();
2119 }
2120 else switch( encoding)
2121 {
2122 case wxFONTENCODING_ISO8859_1 :
2123 enc = kCFStringEncodingISOLatin1 ;
2124 break ;
2125 case wxFONTENCODING_ISO8859_2 :
2126 enc = kCFStringEncodingISOLatin2;
2127 break ;
2128 case wxFONTENCODING_ISO8859_3 :
2129 enc = kCFStringEncodingISOLatin3 ;
2130 break ;
2131 case wxFONTENCODING_ISO8859_4 :
2132 enc = kCFStringEncodingISOLatin4;
2133 break ;
2134 case wxFONTENCODING_ISO8859_5 :
2135 enc = kCFStringEncodingISOLatinCyrillic;
2136 break ;
2137 case wxFONTENCODING_ISO8859_6 :
2138 enc = kCFStringEncodingISOLatinArabic;
2139 break ;
2140 case wxFONTENCODING_ISO8859_7 :
2141 enc = kCFStringEncodingISOLatinGreek;
2142 break ;
2143 case wxFONTENCODING_ISO8859_8 :
2144 enc = kCFStringEncodingISOLatinHebrew;
2145 break ;
2146 case wxFONTENCODING_ISO8859_9 :
2147 enc = kCFStringEncodingISOLatin5;
2148 break ;
2149 case wxFONTENCODING_ISO8859_10 :
2150 enc = kCFStringEncodingISOLatin6;
2151 break ;
2152 case wxFONTENCODING_ISO8859_11 :
2153 enc = kCFStringEncodingISOLatinThai;
2154 break ;
2155 case wxFONTENCODING_ISO8859_13 :
2156 enc = kCFStringEncodingISOLatin7;
2157 break ;
2158 case wxFONTENCODING_ISO8859_14 :
2159 enc = kCFStringEncodingISOLatin8;
2160 break ;
2161 case wxFONTENCODING_ISO8859_15 :
2162 enc = kCFStringEncodingISOLatin9;
2163 break ;
2164
2165 case wxFONTENCODING_KOI8 :
2166 enc = kCFStringEncodingKOI8_R;
2167 break ;
2168 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2169 enc = kCFStringEncodingDOSRussian;
2170 break ;
2171
2172 // case wxFONTENCODING_BULGARIAN :
2173 // enc = ;
2174 // break ;
2175
2176 case wxFONTENCODING_CP437 :
2177 enc =kCFStringEncodingDOSLatinUS ;
2178 break ;
2179 case wxFONTENCODING_CP850 :
2180 enc = kCFStringEncodingDOSLatin1;
2181 break ;
2182 case wxFONTENCODING_CP852 :
2183 enc = kCFStringEncodingDOSLatin2;
2184 break ;
2185 case wxFONTENCODING_CP855 :
2186 enc = kCFStringEncodingDOSCyrillic;
2187 break ;
2188 case wxFONTENCODING_CP866 :
2189 enc =kCFStringEncodingDOSRussian ;
2190 break ;
2191 case wxFONTENCODING_CP874 :
2192 enc = kCFStringEncodingDOSThai;
2193 break ;
2194 case wxFONTENCODING_CP932 :
2195 enc = kCFStringEncodingDOSJapanese;
2196 break ;
2197 case wxFONTENCODING_CP936 :
2198 enc =kCFStringEncodingDOSChineseSimplif ;
2199 break ;
2200 case wxFONTENCODING_CP949 :
2201 enc = kCFStringEncodingDOSKorean;
2202 break ;
2203 case wxFONTENCODING_CP950 :
2204 enc = kCFStringEncodingDOSChineseTrad;
2205 break ;
2206 case wxFONTENCODING_CP1250 :
2207 enc = kCFStringEncodingWindowsLatin2;
2208 break ;
2209 case wxFONTENCODING_CP1251 :
2210 enc =kCFStringEncodingWindowsCyrillic ;
2211 break ;
2212 case wxFONTENCODING_CP1252 :
2213 enc =kCFStringEncodingWindowsLatin1 ;
2214 break ;
2215 case wxFONTENCODING_CP1253 :
2216 enc = kCFStringEncodingWindowsGreek;
2217 break ;
2218 case wxFONTENCODING_CP1254 :
2219 enc = kCFStringEncodingWindowsLatin5;
2220 break ;
2221 case wxFONTENCODING_CP1255 :
2222 enc =kCFStringEncodingWindowsHebrew ;
2223 break ;
2224 case wxFONTENCODING_CP1256 :
2225 enc =kCFStringEncodingWindowsArabic ;
2226 break ;
2227 case wxFONTENCODING_CP1257 :
2228 enc = kCFStringEncodingWindowsBalticRim;
2229 break ;
2230 // This only really encodes to UTF7 (if that) evidently
2231 // case wxFONTENCODING_UTF7 :
2232 // enc = kCFStringEncodingNonLossyASCII ;
2233 // break ;
2234 case wxFONTENCODING_UTF8 :
2235 enc = kCFStringEncodingUTF8 ;
2236 break ;
2237 case wxFONTENCODING_EUC_JP :
2238 enc = kCFStringEncodingEUC_JP;
2239 break ;
2240 case wxFONTENCODING_UTF16 :
2241 enc = kCFStringEncodingUnicode ;
2242 break ;
2243 case wxFONTENCODING_MACROMAN :
2244 enc = kCFStringEncodingMacRoman ;
2245 break ;
2246 case wxFONTENCODING_MACJAPANESE :
2247 enc = kCFStringEncodingMacJapanese ;
2248 break ;
2249 case wxFONTENCODING_MACCHINESETRAD :
2250 enc = kCFStringEncodingMacChineseTrad ;
2251 break ;
2252 case wxFONTENCODING_MACKOREAN :
2253 enc = kCFStringEncodingMacKorean ;
2254 break ;
2255 case wxFONTENCODING_MACARABIC :
2256 enc = kCFStringEncodingMacArabic ;
2257 break ;
2258 case wxFONTENCODING_MACHEBREW :
2259 enc = kCFStringEncodingMacHebrew ;
2260 break ;
2261 case wxFONTENCODING_MACGREEK :
2262 enc = kCFStringEncodingMacGreek ;
2263 break ;
2264 case wxFONTENCODING_MACCYRILLIC :
2265 enc = kCFStringEncodingMacCyrillic ;
2266 break ;
2267 case wxFONTENCODING_MACDEVANAGARI :
2268 enc = kCFStringEncodingMacDevanagari ;
2269 break ;
2270 case wxFONTENCODING_MACGURMUKHI :
2271 enc = kCFStringEncodingMacGurmukhi ;
2272 break ;
2273 case wxFONTENCODING_MACGUJARATI :
2274 enc = kCFStringEncodingMacGujarati ;
2275 break ;
2276 case wxFONTENCODING_MACORIYA :
2277 enc = kCFStringEncodingMacOriya ;
2278 break ;
2279 case wxFONTENCODING_MACBENGALI :
2280 enc = kCFStringEncodingMacBengali ;
2281 break ;
2282 case wxFONTENCODING_MACTAMIL :
2283 enc = kCFStringEncodingMacTamil ;
2284 break ;
2285 case wxFONTENCODING_MACTELUGU :
2286 enc = kCFStringEncodingMacTelugu ;
2287 break ;
2288 case wxFONTENCODING_MACKANNADA :
2289 enc = kCFStringEncodingMacKannada ;
2290 break ;
2291 case wxFONTENCODING_MACMALAJALAM :
2292 enc = kCFStringEncodingMacMalayalam ;
2293 break ;
2294 case wxFONTENCODING_MACSINHALESE :
2295 enc = kCFStringEncodingMacSinhalese ;
2296 break ;
2297 case wxFONTENCODING_MACBURMESE :
2298 enc = kCFStringEncodingMacBurmese ;
2299 break ;
2300 case wxFONTENCODING_MACKHMER :
2301 enc = kCFStringEncodingMacKhmer ;
2302 break ;
2303 case wxFONTENCODING_MACTHAI :
2304 enc = kCFStringEncodingMacThai ;
2305 break ;
2306 case wxFONTENCODING_MACLAOTIAN :
2307 enc = kCFStringEncodingMacLaotian ;
2308 break ;
2309 case wxFONTENCODING_MACGEORGIAN :
2310 enc = kCFStringEncodingMacGeorgian ;
2311 break ;
2312 case wxFONTENCODING_MACARMENIAN :
2313 enc = kCFStringEncodingMacArmenian ;
2314 break ;
2315 case wxFONTENCODING_MACCHINESESIMP :
2316 enc = kCFStringEncodingMacChineseSimp ;
2317 break ;
2318 case wxFONTENCODING_MACTIBETAN :
2319 enc = kCFStringEncodingMacTibetan ;
2320 break ;
2321 case wxFONTENCODING_MACMONGOLIAN :
2322 enc = kCFStringEncodingMacMongolian ;
2323 break ;
2324 case wxFONTENCODING_MACETHIOPIC :
2325 enc = kCFStringEncodingMacEthiopic ;
2326 break ;
2327 case wxFONTENCODING_MACCENTRALEUR :
2328 enc = kCFStringEncodingMacCentralEurRoman ;
2329 break ;
2330 case wxFONTENCODING_MACVIATNAMESE :
2331 enc = kCFStringEncodingMacVietnamese ;
2332 break ;
2333 case wxFONTENCODING_MACARABICEXT :
2334 enc = kCFStringEncodingMacExtArabic ;
2335 break ;
2336 case wxFONTENCODING_MACSYMBOL :
2337 enc = kCFStringEncodingMacSymbol ;
2338 break ;
2339 case wxFONTENCODING_MACDINGBATS :
2340 enc = kCFStringEncodingMacDingbats ;
2341 break ;
2342 case wxFONTENCODING_MACTURKISH :
2343 enc = kCFStringEncodingMacTurkish ;
2344 break ;
2345 case wxFONTENCODING_MACCROATIAN :
2346 enc = kCFStringEncodingMacCroatian ;
2347 break ;
2348 case wxFONTENCODING_MACICELANDIC :
2349 enc = kCFStringEncodingMacIcelandic ;
2350 break ;
2351 case wxFONTENCODING_MACROMANIAN :
2352 enc = kCFStringEncodingMacRomanian ;
2353 break ;
2354 case wxFONTENCODING_MACCELTIC :
2355 enc = kCFStringEncodingMacCeltic ;
2356 break ;
2357 case wxFONTENCODING_MACGAELIC :
2358 enc = kCFStringEncodingMacGaelic ;
2359 break ;
2360 // case wxFONTENCODING_MACKEYBOARD :
2361 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2362 // break ;
2363 default :
2364 // because gcc is picky
2365 break ;
2366 } ;
2367 return enc ;
2368 }
2369
2370 class wxMBConv_cocoa : public wxMBConv
2371 {
2372 public:
2373 wxMBConv_cocoa()
2374 {
2375 Init(CFStringGetSystemEncoding()) ;
2376 }
2377
2378 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2379 {
2380 m_encoding = conv.m_encoding;
2381 }
2382
2383 #if wxUSE_FONTMAP
2384 wxMBConv_cocoa(const wxChar* name)
2385 {
2386 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2387 }
2388 #endif
2389
2390 wxMBConv_cocoa(wxFontEncoding encoding)
2391 {
2392 Init( wxCFStringEncFromFontEnc(encoding) );
2393 }
2394
2395 ~wxMBConv_cocoa()
2396 {
2397 }
2398
2399 void Init( CFStringEncoding encoding)
2400 {
2401 m_encoding = encoding ;
2402 }
2403
2404 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2405 {
2406 wxASSERT(szUnConv);
2407
2408 CFStringRef theString = CFStringCreateWithBytes (
2409 NULL, //the allocator
2410 (const UInt8*)szUnConv,
2411 strlen(szUnConv),
2412 m_encoding,
2413 false //no BOM/external representation
2414 );
2415
2416 wxASSERT(theString);
2417
2418 size_t nOutLength = CFStringGetLength(theString);
2419
2420 if (szOut == NULL)
2421 {
2422 CFRelease(theString);
2423 return nOutLength;
2424 }
2425
2426 CFRange theRange = { 0, nOutSize };
2427
2428 #if SIZEOF_WCHAR_T == 4
2429 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2430 #endif
2431
2432 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2433
2434 CFRelease(theString);
2435
2436 szUniCharBuffer[nOutLength] = '\0' ;
2437
2438 #if SIZEOF_WCHAR_T == 4
2439 wxMBConvUTF16 converter ;
2440 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2441 delete[] szUniCharBuffer;
2442 #endif
2443
2444 return nOutLength;
2445 }
2446
2447 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2448 {
2449 wxASSERT(szUnConv);
2450
2451 size_t nRealOutSize;
2452 size_t nBufSize = wxWcslen(szUnConv);
2453 UniChar* szUniBuffer = (UniChar*) szUnConv;
2454
2455 #if SIZEOF_WCHAR_T == 4
2456 wxMBConvUTF16 converter ;
2457 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2458 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2459 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2460 nBufSize /= sizeof(UniChar);
2461 #endif
2462
2463 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2464 NULL, //allocator
2465 szUniBuffer,
2466 nBufSize,
2467 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2468 );
2469
2470 wxASSERT(theString);
2471
2472 //Note that CER puts a BOM when converting to unicode
2473 //so we check and use getchars instead in that case
2474 if (m_encoding == kCFStringEncodingUnicode)
2475 {
2476 if (szOut != NULL)
2477 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2478
2479 nRealOutSize = CFStringGetLength(theString) + 1;
2480 }
2481 else
2482 {
2483 CFStringGetBytes(
2484 theString,
2485 CFRangeMake(0, CFStringGetLength(theString)),
2486 m_encoding,
2487 0, //what to put in characters that can't be converted -
2488 //0 tells CFString to return NULL if it meets such a character
2489 false, //not an external representation
2490 (UInt8*) szOut,
2491 nOutSize,
2492 (CFIndex*) &nRealOutSize
2493 );
2494 }
2495
2496 CFRelease(theString);
2497
2498 #if SIZEOF_WCHAR_T == 4
2499 delete[] szUniBuffer;
2500 #endif
2501
2502 return nRealOutSize - 1;
2503 }
2504
2505 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2506
2507 bool IsOk() const
2508
2509 bool IsOk() const
2510 {
2511 return m_encoding != kCFStringEncodingInvalidId &&
2512 CFStringIsEncodingAvailable(m_encoding);
2513 }
2514
2515 private:
2516 CFStringEncoding m_encoding ;
2517 };
2518
2519 #endif // defined(__WXCOCOA__)
2520
2521 // ============================================================================
2522 // Mac conversion classes
2523 // ============================================================================
2524
2525 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2526
2527 class wxMBConv_mac : public wxMBConv
2528 {
2529 public:
2530 wxMBConv_mac()
2531 {
2532 Init(CFStringGetSystemEncoding()) ;
2533 }
2534
2535 wxMBConv_mac(const wxMBConv_mac& conv)
2536 {
2537 Init(conv.m_char_encoding);
2538 }
2539
2540 #if wxUSE_FONTMAP
2541 wxMBConv_mac(const wxChar* name)
2542 {
2543 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2544 }
2545 #endif
2546
2547 wxMBConv_mac(wxFontEncoding encoding)
2548 {
2549 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2550 }
2551
2552 ~wxMBConv_mac()
2553 {
2554 OSStatus status = noErr ;
2555 status = TECDisposeConverter(m_MB2WC_converter);
2556 status = TECDisposeConverter(m_WC2MB_converter);
2557 }
2558
2559
2560 void Init( TextEncodingBase encoding)
2561 {
2562 OSStatus status = noErr ;
2563 m_char_encoding = encoding ;
2564 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2565
2566 status = TECCreateConverter(&m_MB2WC_converter,
2567 m_char_encoding,
2568 m_unicode_encoding);
2569 status = TECCreateConverter(&m_WC2MB_converter,
2570 m_unicode_encoding,
2571 m_char_encoding);
2572 }
2573
2574 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2575 {
2576 OSStatus status = noErr ;
2577 ByteCount byteOutLen ;
2578 ByteCount byteInLen = strlen(psz) ;
2579 wchar_t *tbuf = NULL ;
2580 UniChar* ubuf = NULL ;
2581 size_t res = 0 ;
2582
2583 if (buf == NULL)
2584 {
2585 //apple specs say at least 32
2586 n = wxMax( 32 , byteInLen ) ;
2587 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2588 }
2589 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2590 #if SIZEOF_WCHAR_T == 4
2591 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2592 #else
2593 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2594 #endif
2595 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2596 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2597 #if SIZEOF_WCHAR_T == 4
2598 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2599 // is not properly terminated we get random characters at the end
2600 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2601 wxMBConvUTF16 converter ;
2602 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2603 free( ubuf ) ;
2604 #else
2605 res = byteOutLen / sizeof( UniChar ) ;
2606 #endif
2607 if ( buf == NULL )
2608 free(tbuf) ;
2609
2610 if ( buf && res < n)
2611 buf[res] = 0;
2612
2613 return res ;
2614 }
2615
2616 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2617 {
2618 OSStatus status = noErr ;
2619 ByteCount byteOutLen ;
2620 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2621
2622 char *tbuf = NULL ;
2623
2624 if (buf == NULL)
2625 {
2626 //apple specs say at least 32
2627 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2628 tbuf = (char*) malloc( n ) ;
2629 }
2630
2631 ByteCount byteBufferLen = n ;
2632 UniChar* ubuf = NULL ;
2633 #if SIZEOF_WCHAR_T == 4
2634 wxMBConvUTF16 converter ;
2635 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2636 byteInLen = unicharlen ;
2637 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2638 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2639 #else
2640 ubuf = (UniChar*) psz ;
2641 #endif
2642 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2643 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2644 #if SIZEOF_WCHAR_T == 4
2645 free( ubuf ) ;
2646 #endif
2647 if ( buf == NULL )
2648 free(tbuf) ;
2649
2650 size_t res = byteOutLen ;
2651 if ( buf && res < n)
2652 {
2653 buf[res] = 0;
2654
2655 //we need to double-trip to verify it didn't insert any ? in place
2656 //of bogus characters
2657 wxWCharBuffer wcBuf(n);
2658 size_t pszlen = wxWcslen(psz);
2659 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2660 wxWcslen(wcBuf) != pszlen ||
2661 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2662 {
2663 // we didn't obtain the same thing we started from, hence
2664 // the conversion was lossy and we consider that it failed
2665 return (size_t)-1;
2666 }
2667 }
2668
2669 return res ;
2670 }
2671
2672 virtual wxMBConv *Clone() const { return wxMBConv_mac(*this); }
2673
2674 bool IsOk() const
2675 bool IsOk() const
2676 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2677
2678 private:
2679 TECObjectRef m_MB2WC_converter ;
2680 TECObjectRef m_WC2MB_converter ;
2681
2682 TextEncodingBase m_char_encoding ;
2683 TextEncodingBase m_unicode_encoding ;
2684 };
2685
2686 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2687
2688 // ============================================================================
2689 // wxEncodingConverter based conversion classes
2690 // ============================================================================
2691
2692 #if wxUSE_FONTMAP
2693
2694 class wxMBConv_wxwin : public wxMBConv
2695 {
2696 private:
2697 void Init()
2698 {
2699 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2700 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2701 }
2702
2703 public:
2704 // temporarily just use wxEncodingConverter stuff,
2705 // so that it works while a better implementation is built
2706 wxMBConv_wxwin(const wxChar* name)
2707 {
2708 if (name)
2709 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2710 else
2711 m_enc = wxFONTENCODING_SYSTEM;
2712
2713 Init();
2714 }
2715
2716 wxMBConv_wxwin(wxFontEncoding enc)
2717 {
2718 m_enc = enc;
2719
2720 Init();
2721 }
2722
2723 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2724 {
2725 size_t inbuf = strlen(psz);
2726 if (buf)
2727 {
2728 if (!m2w.Convert(psz,buf))
2729 return (size_t)-1;
2730 }
2731 return inbuf;
2732 }
2733
2734 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2735 {
2736 const size_t inbuf = wxWcslen(psz);
2737 if (buf)
2738 {
2739 if (!w2m.Convert(psz,buf))
2740 return (size_t)-1;
2741 }
2742
2743 return inbuf;
2744 }
2745
2746 virtual size_t GetMBNulLen() const
2747 {
2748 switch ( m_enc )
2749 {
2750 case wxFONTENCODING_UTF16BE:
2751 case wxFONTENCODING_UTF16LE:
2752 return 2;
2753
2754 case wxFONTENCODING_UTF32BE:
2755 case wxFONTENCODING_UTF32LE:
2756 return 4;
2757
2758 default:
2759 return 1;
2760 }
2761 }
2762
2763 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2764
2765 bool IsOk() const { return m_ok; }
2766
2767 public:
2768 wxFontEncoding m_enc;
2769 wxEncodingConverter m2w, w2m;
2770
2771 private:
2772 // were we initialized successfully?
2773 bool m_ok;
2774
2775 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2776 };
2777
2778 // make the constructors available for unit testing
2779 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2780 {
2781 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2782 if ( !result->IsOk() )
2783 {
2784 delete result;
2785 return 0;
2786 }
2787 return result;
2788 }
2789
2790 #endif // wxUSE_FONTMAP
2791
2792 // ============================================================================
2793 // wxCSConv implementation
2794 // ============================================================================
2795
2796 void wxCSConv::Init()
2797 {
2798 m_name = NULL;
2799 m_convReal = NULL;
2800 m_deferred = true;
2801 }
2802
2803 wxCSConv::wxCSConv(const wxChar *charset)
2804 {
2805 Init();
2806
2807 if ( charset )
2808 {
2809 SetName(charset);
2810 }
2811
2812 #if wxUSE_FONTMAP
2813 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2814 #else
2815 m_encoding = wxFONTENCODING_SYSTEM;
2816 #endif
2817 }
2818
2819 wxCSConv::wxCSConv(wxFontEncoding encoding)
2820 {
2821 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2822 {
2823 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2824
2825 encoding = wxFONTENCODING_SYSTEM;
2826 }
2827
2828 Init();
2829
2830 m_encoding = encoding;
2831 }
2832
// Destructor: all cleanup is delegated to Clear().
2833 wxCSConv::~wxCSConv()
2834 {
2835 Clear();
2836 }
2837
2838 wxCSConv::wxCSConv(const wxCSConv& conv)
2839 : wxMBConv()
2840 {
2841 Init();
2842
2843 SetName(conv.m_name);
2844 m_encoding = conv.m_encoding;
2845 }
2846
2847 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2848 {
2849 Clear();
2850
2851 SetName(conv.m_name);
2852 m_encoding = conv.m_encoding;
2853
2854 return *this;
2855 }
2856
2857 void wxCSConv::Clear()
2858 {
2859 free(m_name);
2860 delete m_convReal;
2861
2862 m_name = NULL;
2863 m_convReal = NULL;
2864 }
2865
2866 void wxCSConv::SetName(const wxChar *charset)
2867 {
2868 if (charset)
2869 {
2870 m_name = wxStrdup(charset);
2871 m_deferred = true;
2872 }
2873 }
2874
// Hash map type from wxFontEncoding to wxString and the file-local cache of
// this type. wxCSConv::DoCreate() stores in it the charset name for which
// creating a wxMBConv_iconv succeeded for an encoding, or an empty string
// if none of the names worked.
2875 #if wxUSE_FONTMAP
2876 #include "wx/hashmap.h"
2877
2878 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2879 wxEncodingNameCache );
2880
2881 static wxEncodingNameCache gs_nameCache;
2882 #endif
2883
2884 wxMBConv *wxCSConv::DoCreate() const
2885 {
2886 #if wxUSE_FONTMAP
2887 wxLogTrace(TRACE_STRCONV,
2888 wxT("creating conversion for %s"),
2889 (m_name ? m_name
2890 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2891 #endif // wxUSE_FONTMAP
2892
2893 // check for the special case of ASCII or ISO8859-1 charset: as we have
2894 // special knowledge of it anyhow, we don't need to create a special
2895 // conversion object
2896 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2897 m_encoding == wxFONTENCODING_DEFAULT )
2898 {
2899 // don't convert at all
2900 return NULL;
2901 }
2902
2903 // we trust OS to do conversion better than we can so try external
2904 // conversion methods first
2905 //
2906 // the full order is:
2907 // 1. OS conversion (iconv() under Unix or Win32 API)
2908 // 2. hard coded conversions for UTF
2909 // 3. wxEncodingConverter as fall back
2910
2911 // step (1)
2912 #ifdef HAVE_ICONV
2913 #if !wxUSE_FONTMAP
2914 if ( m_name )
2915 #endif // !wxUSE_FONTMAP
2916 {
2917 wxString name(m_name);
2918 wxFontEncoding encoding(m_encoding);
2919
2920 if ( !name.empty() )
2921 {
2922 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2923 if ( conv->IsOk() )
2924 return conv;
2925
2926 delete conv;
2927
2928 #if wxUSE_FONTMAP
2929 encoding =
2930 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2931 #endif // wxUSE_FONTMAP
2932 }
2933 #if wxUSE_FONTMAP
2934 {
2935 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2936 if ( it != gs_nameCache.end() )
2937 {
2938 if ( it->second.empty() )
2939 return NULL;
2940
2941 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2942 if ( conv->IsOk() )
2943 return conv;
2944
2945 delete conv;
2946 }
2947
2948 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2949
2950 for ( ; *names; ++names )
2951 {
2952 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2953 if ( conv->IsOk() )
2954 {
2955 gs_nameCache[encoding] = *names;
2956 return conv;
2957 }
2958
2959 delete conv;
2960 }
2961
2962 gs_nameCache[encoding] = _T(""); // cache the failure
2963 }
2964 #endif // wxUSE_FONTMAP
2965 }
2966 #endif // HAVE_ICONV
2967
2968 #ifdef wxHAVE_WIN32_MB2WC
2969 {
2970 #if wxUSE_FONTMAP
2971 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2972 : new wxMBConv_win32(m_encoding);
2973 if ( conv->IsOk() )
2974 return conv;
2975
2976 delete conv;
2977 #else
2978 return NULL;
2979 #endif
2980 }
2981 #endif // wxHAVE_WIN32_MB2WC
2982 #if defined(__WXMAC__)
2983 {
2984 // leave UTF16 and UTF32 to the built-ins of wx
2985 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2986 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2987 {
2988
2989 #if wxUSE_FONTMAP
2990 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2991 : new wxMBConv_mac(m_encoding);
2992 #else
2993 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2994 #endif
2995 if ( conv->IsOk() )
2996 return conv;
2997
2998 delete conv;
2999 }
3000 }
3001 #endif
3002 #if defined(__WXCOCOA__)
3003 {
3004 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3005 {
3006
3007 #if wxUSE_FONTMAP
3008 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3009 : new wxMBConv_cocoa(m_encoding);
3010 #else
3011 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3012 #endif
3013 if ( conv->IsOk() )
3014 return conv;
3015
3016 delete conv;
3017 }
3018 }
3019 #endif
3020 // step (2)
3021 wxFontEncoding enc = m_encoding;
3022 #if wxUSE_FONTMAP
3023 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3024 {
3025 // use "false" to suppress interactive dialogs -- we can be called from
3026 // anywhere and popping up a dialog from here is the last thing we want to
3027 // do
3028 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3029 }
3030 #endif // wxUSE_FONTMAP
3031
3032 switch ( enc )
3033 {
3034 case wxFONTENCODING_UTF7:
3035 return new wxMBConvUTF7;
3036
3037 case wxFONTENCODING_UTF8:
3038 return new wxMBConvUTF8;
3039
3040 case wxFONTENCODING_UTF16BE:
3041 return new wxMBConvUTF16BE;
3042
3043 case wxFONTENCODING_UTF16LE:
3044 return new wxMBConvUTF16LE;
3045
3046 case wxFONTENCODING_UTF32BE:
3047 return new wxMBConvUTF32BE;
3048
3049 case wxFONTENCODING_UTF32LE:
3050 return new wxMBConvUTF32LE;
3051
3052 default:
3053 // nothing to do but put here to suppress gcc warnings
3054 ;
3055 }
3056
3057 // step (3)
3058 #if wxUSE_FONTMAP
3059 {
3060 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3061 : new wxMBConv_wxwin(m_encoding);
3062 if ( conv->IsOk() )
3063 return conv;
3064
3065 delete conv;
3066 }
3067 #endif // wxUSE_FONTMAP
3068
3069 // NB: This is a hack to prevent deadlock. What could otherwise happen
3070 // in Unicode build: wxConvLocal creation ends up being here
3071 // because of some failure and logs the error. But wxLog will try to
3072 // attach timestamp, for which it will need wxConvLocal (to convert
3073 // time to char* and then wchar_t*), but that fails, tries to log
3074 // error, but wxLog has a (already locked) critical section that
3075 // guards static buffer.
3076 static bool alreadyLoggingError = false;
3077 if (!alreadyLoggingError)
3078 {
3079 alreadyLoggingError = true;
3080 wxLogError(_("Cannot convert from the charset '%s'!"),
3081 m_name ? m_name
3082 :
3083 #if wxUSE_FONTMAP
3084 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3085 #else // !wxUSE_FONTMAP
3086 wxString::Format(_("encoding %s"), m_encoding).c_str()
3087 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3088 );
3089 alreadyLoggingError = false;
3090 }
3091
3092 return NULL;
3093 }
3094
3095 void wxCSConv::CreateConvIfNeeded() const
3096 {
3097 if ( m_deferred )
3098 {
3099 wxCSConv *self = (wxCSConv *)this; // const_cast
3100
3101 #if wxUSE_INTL
3102 // if we don't have neither the name nor the encoding, use the default
3103 // encoding for this system
3104 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3105 {
3106 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3107 }
3108 #endif // wxUSE_INTL
3109
3110 self->m_convReal = DoCreate();
3111 self->m_deferred = false;
3112 }
3113 }
3114
3115 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3116 {
3117 CreateConvIfNeeded();
3118
3119 if (m_convReal)
3120 return m_convReal->MB2WC(buf, psz, n);
3121
3122 // latin-1 (direct)
3123 size_t len = strlen(psz);
3124
3125 if (buf)
3126 {
3127 for (size_t c = 0; c <= len; c++)
3128 buf[c] = (unsigned char)(psz[c]);
3129 }
3130
3131 return len;
3132 }
3133
3134 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3135 {
3136 CreateConvIfNeeded();
3137
3138 if (m_convReal)
3139 return m_convReal->WC2MB(buf, psz, n);
3140
3141 // latin-1 (direct)
3142 const size_t len = wxWcslen(psz);
3143 if (buf)
3144 {
3145 for (size_t c = 0; c <= len; c++)
3146 {
3147 if (psz[c] > 0xFF)
3148 return (size_t)-1;
3149 buf[c] = (char)psz[c];
3150 }
3151 }
3152 else
3153 {
3154 for (size_t c = 0; c <= len; c++)
3155 {
3156 if (psz[c] > 0xFF)
3157 return (size_t)-1;
3158 }
3159 }
3160
3161 return len;
3162 }
3163
3164 size_t wxCSConv::GetMBNulLen() const
3165 {
3166 CreateConvIfNeeded();
3167
3168 if ( m_convReal )
3169 {
3170 return m_convReal->GetMBNulLen();
3171 }
3172
3173 return 1;
3174 }
3175
3176 // ----------------------------------------------------------------------------
3177 // globals
3178 // ----------------------------------------------------------------------------
3179
// File-local conversion objects and the exported references/pointers bound
// to them. The type of wxConvLibcObj is selected at compile time:
// wxMBConv_win32 under __WINDOWS__, wxMBConv_mac for __WXMAC__ without
// __MACH__ and wxMBConvLibc in all other cases.
3180 #ifdef __WINDOWS__
3181 static wxMBConv_win32 wxConvLibcObj;
3182 #elif defined(__WXMAC__) && !defined(__MACH__)
3183 static wxMBConv_mac wxConvLibcObj ;
3184 #else
3185 static wxMBConvLibc wxConvLibcObj;
3186 #endif
3187
3188 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3189 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3190 static wxMBConvUTF7 wxConvUTF7Obj;
3191 static wxMBConvUTF8 wxConvUTF8Obj;
3192
// the public names are references to the objects defined above
3193 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3194 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3195 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3196 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3197 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
// wxConvCurrent initially points to the libc-based object
3198 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
// wxConvFileName points to the UTF-8 object under __WXOSX__ and to the
// libc-based one everywhere else
3199 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3200 #ifdef __WXOSX__
3201 wxConvUTF8Obj;
3202 #else
3203 wxConvLibcObj;
3204 #endif
3205
3206
3207 #else // !wxUSE_WCHAR_T
3208
3209 // stand-ins in absence of wchar_t
// (plain wxMBConv objects are defined under these names here, whereas the
// wxUSE_WCHAR_T branch above defines them as references to static objects)
3210 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3211 wxConvISO8859_1,
3212 wxConvLocal,
3213 wxConvUTF8;
3214
3215 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T