]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
1. changed all "wxMBConv& conv" parameters to "const wxMBConv&"
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
25
26 #ifdef __BORLANDC__
27 #pragma hdrstop
28 #endif
29
30 #ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33 #endif // WX_PRECOMP
34
35 #include "wx/strconv.h"
36
37 #if wxUSE_WCHAR_T
38
39 #ifdef __WINDOWS__
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #endif
43
44 #ifndef __WXWINCE__
45 #include <errno.h>
46 #endif
47
48 #include <ctype.h>
49 #include <string.h>
50 #include <stdlib.h>
51
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
55
56 #ifdef __SALFORDC__
57 #include <clib.h>
58 #endif
59
60 #ifdef HAVE_ICONV
61 #include <iconv.h>
62 #include "wx/thread.h"
63 #endif
64
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
67 #include "wx/utils.h"
68
69 #ifdef __WXMAC__
70 #ifndef __DARWIN__
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
74 #endif
75
76 #include "wx/mac/private.h" // includes mac headers
77 #endif
78
// name under which this file identifies itself; presumably used as a trace
// mask for log messages -- TODO confirm against the uses later in the file
79 #define TRACE_STRCONV _T("strconv")
80
// WC_UTF16 is defined when wchar_t is 2 bytes wide: the code below then
// treats wide strings as UTF-16 (i.e. possibly containing surrogate pairs)
81 #if SIZEOF_WCHAR_T == 2
82 #define WC_UTF16
83 #endif
84
85 // ============================================================================
86 // implementation
87 // ============================================================================
88
// helper function of ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are NUL (or n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; ++i )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
97
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4
100 // ----------------------------------------------------------------------------
101
102
103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
104 {
105 if (input<=0xffff)
106 {
107 if (output)
108 *output = (wxUint16) input;
109 return 1;
110 }
111 else if (input>=0x110000)
112 {
113 return (size_t)-1;
114 }
115 else
116 {
117 if (output)
118 {
119 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
120 *output = (wxUint16) ((input&0x3ff)+0xdc00);
121 }
122 return 2;
123 }
124 }
125
126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
127 {
128 if ((*input<0xd800) || (*input>0xdfff))
129 {
130 output = *input;
131 return 1;
132 }
133 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
134 {
135 output = *input;
136 return (size_t)-1;
137 }
138 else
139 {
140 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
141 return 2;
142 }
143 }
144
145
146 // ----------------------------------------------------------------------------
147 // wxMBConv
148 // ----------------------------------------------------------------------------
149
// Default implementation of ToWChar() in terms of MB2WC().
//
// Converts srcLen bytes of src, or src up to its terminating NUL if srcLen
// is (size_t)-1. The input may consist of several NUL-separated chunks when
// srcLen is given; each of them is converted in turn.
//
// Returns the number of wide characters written to dst (or which would be
// written if dst is NULL), counting the trailing NUL, or wxCONV_FAILED if
// GetMBNulLen() or MB2WC() fails or if dst is not NULL and dstLen is too
// small.
150 size_t
151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
152 const char *src, size_t srcLen) const
153 {
154 // although new conversion classes are supposed to implement this function
155 // directly, the existing ones only implement the old MB2WC() and so, to
156 // avoid having to rewrite all conversion classes at once, we provide a
157 // default (but not efficient) implementation of this one in terms of the
158 // old function by copying the input to ensure that it's NUL-terminated and
159 // then using MB2WC() to convert it
160
161 // the number of chars [which would be] written to dst [if it were not NULL]
162 size_t dstWritten = 0;
163
164 // the number of NULs terminating this string
165 size_t nulLen wxDUMMY_INITIALIZE(0);
166
167 // if we were not given the input size we just have to assume that the
168 // string is properly terminated as we have no way of knowing how long it
169 // is anyhow, but if we do have the size check whether there are enough
170 // NULs at the end
171 wxCharBuffer bufTmp;
172 const char *srcEnd;
173 if ( srcLen != (size_t)-1 )
174 {
175 // we need to know how to find the end of this string
176 nulLen = GetMBNulLen();
177 if ( nulLen == wxCONV_FAILED )
178 return wxCONV_FAILED;
179
180 // if there are enough NULs we can avoid the copy
181 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
182 {
183 // make a copy in order to properly NUL-terminate the string
184 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
185 char * const p = bufTmp.data();
186 memcpy(p, src, srcLen);
187 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
188 *s = '\0';
189
190 src = bufTmp;
191 }
192
193 srcEnd = src + srcLen;
194 }
195 else // quit after the first loop iteration
196 {
197 srcEnd = NULL;
198 }
199
200 for ( ;; )
201 {
202 // try to convert the current chunk
203 size_t lenChunk = MB2WC(NULL, src, 0);
204 if ( lenChunk == 0 )
205 {
206 // nothing left in the input string, conversion succeeded; but
207 // still account for the trailing NUL
// NOTE(review): the NUL is only counted here, nothing is stored in dst
// and dstWritten is not checked against dstLen on this path
208 dstWritten++;
209 break;
210 }
211
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
214
215 lenChunk++; // for trailing NUL
216
217 dstWritten += lenChunk;
218
219 if ( dst )
220 {
221 if ( dstWritten > dstLen )
222 return wxCONV_FAILED;
223
224 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
225 return wxCONV_FAILED;
226
227 dst += lenChunk;
228 }
229
230 if ( !srcEnd )
231 {
232 // we convert the entire string in this case, as we suppose that the
233 // string is NUL-terminated and so srcEnd is not used at all
234 break;
235 }
236
237 // advance the input pointer past the end of this chunk
238 while ( NotAllNULs(src, nulLen) )
239 {
240 // notice that we must skip over multiple bytes here as we suppose
241 // that if NUL takes 2 or 4 bytes, then all the other characters do
242 // too and so if advanced by a single byte we might erroneously
243 // detect sequences of NUL bytes in the middle of the input
244 src += nulLen;
245 }
246
247 src += nulLen; // skipping over its terminator as well
248
249 // note that ">=" (and not just "==") is needed here as the terminator
250 // we skipped just above could be inside or just after the buffer
251 // delimited by srcEnd
252 if ( src >= srcEnd )
253 break;
254 }
255
256 return dstWritten;
257 }
258
259 size_t
260 wxMBConv::FromWChar(char *dst, size_t dstLen,
261 const wchar_t *src, size_t srcLen) const
262 {
263 // the number of chars [which would be] written to dst [if it were not NULL]
264 size_t dstWritten = 0;
265
266 // make a copy of the input string unless it is already properly
267 // NUL-terminated
268 //
269 // if we don't know its length we have no choice but to assume that it is,
270 // indeed, properly terminated
271 wxWCharBuffer bufTmp;
272 if ( srcLen == (size_t)-1 )
273 {
274 srcLen = wxWcslen(src) + 1;
275 }
276 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
277 {
278 // make a copy in order to properly NUL-terminate the string
279 bufTmp = wxWCharBuffer(srcLen);
280 memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
281 src = bufTmp;
282 }
283
284 const size_t lenNul = GetMBNulLen();
285 for ( const wchar_t * const srcEnd = src + srcLen;
286 src < srcEnd;
287 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
288 {
289 // try to convert the current chunk
290 size_t lenChunk = WC2MB(NULL, src, 0);
291
292 if ( lenChunk == wxCONV_FAILED )
293 return wxCONV_FAILED;
294
295 lenChunk += lenNul;
296 dstWritten += lenChunk;
297
298 if ( dst )
299 {
300 if ( dstWritten > dstLen )
301 return wxCONV_FAILED;
302
303 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
304 return wxCONV_FAILED;
305
306 dst += lenChunk;
307 }
308 }
309
310 return dstWritten;
311 }
312
313 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
314 {
315 size_t rc = ToWChar(out, outLen, in);
316 if ( rc != wxCONV_FAILED )
317 {
318 // ToWChar() returns the buffer length, i.e. including the trailing
319 // NUL, while this method doesn't take it into account
320 rc--;
321 }
322
323 return rc;
324 }
325
326 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
327 {
328 size_t rc = FromWChar(out, outLen, in);
329 if ( rc != wxCONV_FAILED )
330 {
331 rc -= GetMBNulLen();
332 }
333
334 return rc;
335 }
336
337 wxMBConv::~wxMBConv()
338 {
339 // nothing to do here (necessary for Darwin linking probably)
340 }
341
342 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
343 {
344 if ( psz )
345 {
346 // calculate the length of the buffer needed first
347 const size_t nLen = MB2WC(NULL, psz, 0);
348 if ( nLen != wxCONV_FAILED )
349 {
350 // now do the actual conversion
351 wxWCharBuffer buf(nLen /* +1 added implicitly */);
352
353 // +1 for the trailing NULL
354 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
355 return buf;
356 }
357 }
358
359 return wxWCharBuffer();
360 }
361
362 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
363 {
364 if ( pwz )
365 {
366 const size_t nLen = WC2MB(NULL, pwz, 0);
367 if ( nLen != wxCONV_FAILED )
368 {
369 // extra space for trailing NUL(s)
370 static const size_t extraLen = GetMaxMBNulLen();
371
372 wxCharBuffer buf(nLen + extraLen - 1);
373 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
374 return buf;
375 }
376 }
377
378 return wxCharBuffer();
379 }
380
381 const wxWCharBuffer
382 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
383 {
384 const size_t dstLen = ToWChar(NULL, 0, in, inLen);
385 if ( dstLen != wxCONV_FAILED )
386 {
387 wxWCharBuffer wbuf(dstLen - 1);
388 if ( ToWChar(wbuf.data(), dstLen, in, inLen) )
389 {
390 if ( outLen )
391 *outLen = dstLen - 1;
392 return wbuf;
393 }
394 }
395
396 if ( outLen )
397 *outLen = 0;
398
399 return wxWCharBuffer();
400 }
401
402 const wxCharBuffer
403 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
404 {
405 const size_t dstLen = FromWChar(NULL, 0, in, inLen);
406 if ( dstLen != wxCONV_FAILED )
407 {
408 wxCharBuffer buf(dstLen - 1);
409 if ( FromWChar(buf.data(), dstLen, in, inLen) )
410 {
411 if ( outLen )
412 *outLen = dstLen - 1;
413 return buf;
414 }
415 }
416
417 if ( outLen )
418 *outLen = 0;
419
420 return wxCharBuffer();
421 }
422
423 // ----------------------------------------------------------------------------
424 // wxMBConvLibc
425 // ----------------------------------------------------------------------------
426
427 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
428 {
429 return wxMB2WC(buf, psz, n);
430 }
431
432 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
433 {
434 return wxWC2MB(buf, psz, n);
435 }
436
437 // ----------------------------------------------------------------------------
438 // wxConvBrokenFileNames
439 // ----------------------------------------------------------------------------
440
441 #ifdef __UNIX__
442
443 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
444 {
445 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
446 || wxStricmp(charset, _T("UTF8")) == 0 )
447 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
448 else
449 m_conv = new wxCSConv(charset);
450 }
451
452 #endif // __UNIX__
453
454 // ----------------------------------------------------------------------------
455 // UTF-7
456 // ----------------------------------------------------------------------------
457
458 // Implementation (C) 2004 Fredrik Roubert
459
460 //
461 // BASE64 decoding table
462 //
// Indexed by byte value: gives the 6 bit value (0x00..0x3f) of the BASE64
// characters 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/' and 0xff for all the
// other bytes, which wxMBConvUTF7::MB2WC() takes as the end of an encoded
// run.
463 static const unsigned char utf7unb64[] =
464 {
465 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
466 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
467 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
468 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
469 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
470 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
471 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
472 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
473 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
474 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
475 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
476 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
477 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
478 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
479 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
480 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
481 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
482 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
483 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
484 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
485 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
486 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
487 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
488 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
489 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
490 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
491 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
492 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
497 };
498
// Decode the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL the decoded characters are stored in it, n being its
// size in wide characters, and a trailing NUL is added if there is room for
// it; if buf is NULL only the length is computed.
//
// Returns the number of wide characters decoded, not counting the NUL, or
// (size_t)-1 if a '+' is not followed by enough BASE64 characters to decode
// at least one byte.
499 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
500 {
501 size_t len = 0;
502
503 while ( *psz && (!buf || (len < n)) )
504 {
505 unsigned char cc = *psz++;
506 if (cc != '+')
507 {
508 // plain ASCII char
509 if (buf)
510 *buf++ = cc;
511 len++;
512 }
513 else if (*psz == '-')
514 {
515 // encoded plus sign
516 if (buf)
517 *buf++ = cc;
518 len++;
519 psz++;
520 }
521 else // start of BASE64 encoded string
522 {
// d accumulates the decoded bits, l is the number of bits in d not used
// yet, lsb tells whether the next byte is the low or the high one of the
// current 16 bit unit and ok becomes true once a byte has been decoded
//
// NOTE(review): len is not checked against n inside this loop, so an
// encoded run seems to be able to write past the end of buf -- confirm
523 bool lsb, ok;
524 unsigned int d, l;
525 for ( ok = lsb = false, d = 0, l = 0;
526 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
527 psz++ )
528 {
529 d <<= 6;
530 d += cc;
531 for (l += 6; l >= 8; lsb = !lsb)
532 {
533 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
534 if (lsb)
535 {
// low byte: this completes the current wide character
536 if (buf)
537 *buf++ |= c;
538 len ++;
539 }
540 else
541 {
// high byte: start a new wide character
542 if (buf)
543 *buf = (wchar_t)(c << 8);
544 }
545
546 ok = true;
547 }
548 }
549
550 if ( !ok )
551 {
552 // in valid UTF7 we should have valid characters after '+'
553 return (size_t)-1;
554 }
555
// the '-' terminating the encoded run is optional and is not part of the
// output
556 if (*psz == '-')
557 psz++;
558 }
559 }
560
561 if ( buf && (len < n) )
562 *buf = '\0';
563
564 return len;
565 }
566
567 //
568 // BASE64 encoding table
569 //
// Indexed by a 6 bit value (0..63): gives the BASE64 character used for it,
// this is the inverse of utf7unb64.
570 static const unsigned char utf7enb64[] =
571 {
572 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
573 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
574 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
575 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
576 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
577 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
578 'w', 'x', 'y', 'z', '0', '1', '2', '3',
579 '4', '5', '6', '7', '8', '9', '+', '/'
580 };
581
582 //
583 // UTF-7 encoding table
584 //
// Indexed by ASCII code (0..127), the values are the character classes:
//
585 // 0 - Set D (directly encoded characters)
586 // 1 - Set O (optional direct characters)
587 // 2 - whitespace characters (optional)
588 // 3 - special characters
589 //
// wxMBConvUTF7::WC2MB() only writes the characters of class 0 directly, all
// the others are BASE64 encoded.
590 static const unsigned char utf7encode[128] =
591 {
592 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
593 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
594 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
595 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
596 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
597 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
598 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
599 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
600 };
601
// Encode the NUL-terminated wide string psz as UTF-7.
//
// If buf is not NULL the result is stored in it, n being its size in bytes,
// and a trailing NUL is added if there is room for it; if buf is NULL only
// the length is computed.
//
// Returns the number of bytes produced, not counting the NUL. When WC_UTF16
// is not defined, (size_t)-1 is returned for characters above 0xffff.
602 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
603 {
604 size_t len = 0;
605
606 while (*psz && ((!buf) || (len < n)))
607 {
608 wchar_t cc = *psz++;
// NOTE(review): if wchar_t is a signed type a negative cc passes the
// "cc < 0x80" test and indexes utf7encode out of bounds -- confirm
609 if (cc < 0x80 && utf7encode[cc] < 1)
610 {
611 // plain ASCII char
612 if (buf)
613 *buf++ = (char)cc;
614 len++;
615 }
616 #ifndef WC_UTF16
617 else if (((wxUint32)cc) > 0xffff)
618 {
619 // no surrogate pair generation (yet?)
620 return (size_t)-1;
621 }
622 #endif
623 else
624 {
// an encoded run is "+<BASE64>-", a '+' alone is written as "+-"
//
// NOTE(review): len is not checked against n while the run is written
625 if (buf)
626 *buf++ = '+';
627 len++;
628 if (cc != '+')
629 {
630 // BASE64 encode string
// d accumulates the bits to output and l is the number of bits in d not
// written yet; each character contributes its high byte first and then
// its low one
631 unsigned int lsb, d, l;
632 for (d = 0, l = 0; /*nothing*/; psz++)
633 {
634 for (lsb = 0; lsb < 2; lsb ++)
635 {
636 d <<= 8;
637 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
638
639 for (l += 8; l >= 6; )
640 {
641 l -= 6;
642 if (buf)
643 *buf++ = utf7enb64[(d >> l) % 64];
644 len++;
645 }
646 }
// the run continues until the end of the string or the next character
// which can be written directly; that character is not consumed here
647 cc = *psz;
648 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
649 break;
650 }
// write the remaining 2 or 4 bits, if any, padded with zero bits
651 if (l != 0)
652 {
653 if (buf)
654 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
655 len++;
656 }
657 }
658 if (buf)
659 *buf++ = '-';
660 len++;
661 }
662 }
663 if (buf && (len < n))
664 *buf = 0;
665 return len;
666 }
667
668 // ----------------------------------------------------------------------------
669 // UTF-8
670 // ----------------------------------------------------------------------------
671
// utf8_max[i] is the largest value which fits in a UTF-8 sequence of i + 1
// bytes; the last element ends the search loop in wxMBConvUTF8::WC2MB()
672 static wxUint32 utf8_max[]=
673 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
674
675 // boundaries of the private use area we use to (temporarily) remap the
676 // bytes which are invalid in a UTF-8 encoded string
677 const wxUint32 wxUnicodePUA = 0x100000;
678 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
679
// Decode the NUL-terminated UTF-8 string psz.
//
// If buf is not NULL the result is stored in it, n being its size in wide
// characters, and a trailing NUL is added if there is room for it; if buf is
// NULL only the length is computed.
//
// Invalid input is handled according to m_options: with
// MAP_INVALID_UTF8_TO_PUA each byte of the invalid sequence is mapped to
// wxUnicodePUA + byte, with MAP_INVALID_UTF8_TO_OCTAL it is written as a
// backslash followed by 3 octal digits (and a valid backslash is doubled),
// otherwise the conversion fails.
//
// Returns the number of wide characters produced, not counting the NUL, or
// (size_t)-1 on failure.
680 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
681 {
682 size_t len = 0;
683
684 while (*psz && ((!buf) || (len < n)))
685 {
// opsz remembers the start of the sequence for the invalid input handling
686 const char *opsz = psz;
687 bool invalid = false;
688 unsigned char cc = *psz++, fc = cc;
// count the leading 1 bits of the first byte
689 unsigned cnt;
690 for (cnt = 0; fc & 0x80; cnt++)
691 fc <<= 1;
692 if (!cnt)
693 {
694 // plain ASCII char
695 if (buf)
696 *buf++ = cc;
697 len++;
698
699 // escape the escape character for octal escapes
700 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
701 && cc == '\\' && (!buf || len < n))
702 {
703 if (buf)
704 *buf++ = cc;
705 len++;
706 }
707 }
708 else
709 {
// cnt is now the number of continuation bytes expected
710 cnt--;
711 if (!cnt)
712 {
713 // invalid UTF-8 sequence
714 invalid = true;
715 }
716 else
717 {
718 unsigned ocnt = cnt - 1;
719 wxUint32 res = cc & (0x3f >> cnt);
720 while (cnt--)
721 {
722 cc = *psz;
723 if ((cc & 0xC0) != 0x80)
724 {
725 // invalid UTF-8 sequence
726 invalid = true;
727 break;
728 }
729 psz++;
730 res = (res << 6) | (cc & 0x3f);
731 }
// a value which would have fit in a shorter sequence is rejected too
732 if (invalid || res <= utf8_max[ocnt])
733 {
734 // illegal UTF-8 encoding
735 invalid = true;
736 }
737 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
738 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
739 {
740 // if one of our PUA characters turns up externally
741 // it must also be treated as an illegal sequence
742 // (a bit like you have to escape an escape character)
743 invalid = true;
744 }
745 else
746 {
747 #ifdef WC_UTF16
748 // cast is ok because wchar_t == wxUint16 if WC_UTF16
// NOTE(review): this may store 2 units although only "len < n" is known
749 size_t pa = encode_utf16(res, (wxUint16 *)buf);
750 if (pa == (size_t)-1)
751 {
752 invalid = true;
753 }
754 else
755 {
756 if (buf)
757 buf += pa;
758 len += pa;
759 }
760 #else // !WC_UTF16
761 if (buf)
762 *buf++ = (wchar_t)res;
763 len++;
764 #endif // WC_UTF16/!WC_UTF16
765 }
766 }
767 if (invalid)
768 {
769 if (m_options & MAP_INVALID_UTF8_TO_PUA)
770 {
771 while (opsz < psz && (!buf || len < n))
772 {
773 #ifdef WC_UTF16
774 // cast is ok because wchar_t == wxUint16 if WC_UTF16
775 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
776 wxASSERT(pa != (size_t)-1);
777 if (buf)
778 buf += pa;
779 opsz++;
780 len += pa;
781 #else
782 if (buf)
783 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
784 opsz++;
785 len++;
786 #endif
787 }
788 }
789 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
790 {
791 while (opsz < psz && (!buf || len < n))
792 {
// the escape is only stored if all 4 of its characters fit, but len is
// advanced in any case
793 if ( buf && len + 3 < n )
794 {
795 unsigned char on = *opsz;
796 *buf++ = L'\\';
797 *buf++ = (wchar_t)( L'0' + on / 0100 );
798 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
799 *buf++ = (wchar_t)( L'0' + on % 010 );
800 }
801 opsz++;
802 len += 4;
803 }
804 }
805 else // MAP_INVALID_UTF8_NOT
806 {
807 return (size_t)-1;
808 }
809 }
810 }
811 }
812 if (buf && (len < n))
813 *buf = 0;
814 return len;
815 }
816
// returns true if wch is one of the octal digits L'0'..L'7'
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
821
// Encode the NUL-terminated wide string psz as UTF-8.
//
// If buf is not NULL the result is stored in it, n being its size in bytes,
// and a trailing NUL is added if there is room for it; if buf is NULL only
// the length is computed.
//
// This also undoes the mappings done by MB2WC() according to m_options: the
// characters in the wxUnicodePUA range become single bytes again with
// MAP_INVALID_UTF8_TO_PUA, and a doubled backslash or a backslash followed
// by 3 octal digits becomes a single byte with MAP_INVALID_UTF8_TO_OCTAL.
//
// Returns the number of bytes produced, not counting the NUL.
//
// NOTE(review): a multibyte sequence is stored as a whole although only
// "len < n" is checked before it -- confirm that the callers always pass a
// big enough buffer
822 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
823 {
824 size_t len = 0;
825
826 while (*psz && ((!buf) || (len < n)))
827 {
828 wxUint32 cc;
829 #ifdef WC_UTF16
830 // cast is ok for WC_UTF16
// an invalid surrogate is not an error here: it is encoded on its own
831 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
832 psz += (pa == (size_t)-1) ? 1 : pa;
833 #else
834 cc=(*psz++) & 0x7fffffff;
835 #endif
836
837 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
838 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
839 {
840 if (buf)
841 *buf++ = (char)(cc - wxUnicodePUA);
842 len++;
843 }
844 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
845 && cc == L'\\' && psz[0] == L'\\' )
846 {
847 if (buf)
848 *buf++ = (char)cc;
849 psz++;
850 len++;
851 }
852 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
853 cc == L'\\' &&
854 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
855 {
856 if (buf)
857 {
858 *buf++ = (char) ((psz[0] - L'0')*0100 +
859 (psz[1] - L'0')*010 +
860 (psz[2] - L'0'));
861 }
862
863 psz += 3;
864 len++;
865 }
866 else
867 {
// cnt is the number of continuation bytes needed for cc
868 unsigned cnt;
869 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
870 if (!cnt)
871 {
872 // plain ASCII char
873 if (buf)
874 *buf++ = (char) cc;
875 len++;
876 }
877
878 else
879 {
880 len += cnt + 1;
881 if (buf)
882 {
// lead byte: cnt + 1 high bits set followed by the top bits of cc, then
// the continuation bytes with 6 bits of cc each
883 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
884 while (cnt--)
885 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
886 }
887 }
888 }
889 }
890
891 if (buf && (len<n))
892 *buf = 0;
893
894 return len;
895 }
896
897 // ----------------------------------------------------------------------------
898 // UTF-16
899 // ----------------------------------------------------------------------------
900
// "straight" is the alias of the UTF-16 converter class whose byte order is
// the one selected by WORDS_BIGENDIAN (big endian if it is defined, little
// endian otherwise) and "swap" of the other one, so that each method below
// is written only once and ends up in the right class
901 #ifdef WORDS_BIGENDIAN
902 #define wxMBConvUTF16straight wxMBConvUTF16BE
903 #define wxMBConvUTF16swap wxMBConvUTF16LE
904 #else
905 #define wxMBConvUTF16swap wxMBConvUTF16BE
906 #define wxMBConvUTF16straight wxMBConvUTF16LE
907 #endif
908
909
910 #ifdef WC_UTF16
911
912 // copy 16bit MB to 16bit String
913 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
914 {
915 size_t len=0;
916
917 while (*(wxUint16*)psz && (!buf || len < n))
918 {
919 if (buf)
920 *buf++ = *(wxUint16*)psz;
921 len++;
922
923 psz += sizeof(wxUint16);
924 }
925 if (buf && len<n) *buf=0;
926
927 return len;
928 }
929
930
931 // copy 16bit String to 16bit MB
932 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
933 {
934 size_t len=0;
935
936 while (*psz && (!buf || len < n))
937 {
938 if (buf)
939 {
940 *(wxUint16*)buf = *psz;
941 buf += sizeof(wxUint16);
942 }
943 len += sizeof(wxUint16);
944 psz++;
945 }
946 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
947
948 return len;
949 }
950
951
952 // swap 16bit MB to 16bit String
953 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
954 {
955 size_t len = 0;
956
957 // UTF16 string must be terminated by 2 NULs as single NULs may occur
958 // inside the string
959 while ( (psz[0] || psz[1]) && (!buf || len < n) )
960 {
961 if ( buf )
962 {
963 ((char *)buf)[0] = psz[1];
964 ((char *)buf)[1] = psz[0];
965 buf++;
966 }
967 len++;
968 psz += 2;
969 }
970
971 if ( buf && len < n )
972 *buf = L'\0';
973
974 return len;
975 }
976
977
978 // swap 16bit MB to 16bit String
979 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
980 {
981 size_t len = 0;
982
983 while ( *psz && (!buf || len < n) )
984 {
985 if ( buf )
986 {
987 *buf++ = ((char*)psz)[1];
988 *buf++ = ((char*)psz)[0];
989 }
990 len += 2;
991 psz++;
992 }
993
994 if ( buf && len < n )
995 *buf = '\0';
996
997 return len;
998 }
999
1000
1001 #else // WC_UTF16
1002
1003
1004 // copy 16bit MB to 32bit String
1005 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1006 {
1007 size_t len=0;
1008
1009 while (*(wxUint16*)psz && (!buf || len < n))
1010 {
1011 wxUint32 cc;
1012 size_t pa=decode_utf16((wxUint16*)psz, cc);
1013 if (pa == (size_t)-1)
1014 return pa;
1015
1016 if (buf)
1017 *buf++ = (wchar_t)cc;
1018 len++;
1019 psz += pa * sizeof(wxUint16);
1020 }
1021 if (buf && len<n) *buf=0;
1022
1023 return len;
1024 }
1025
1026
1027 // copy 32bit String to 16bit MB
1028 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1029 {
1030 size_t len=0;
1031
1032 while (*psz && (!buf || len < n))
1033 {
1034 wxUint16 cc[2];
1035 size_t pa=encode_utf16(*psz, cc);
1036
1037 if (pa == (size_t)-1)
1038 return pa;
1039
1040 if (buf)
1041 {
1042 *(wxUint16*)buf = cc[0];
1043 buf += sizeof(wxUint16);
1044 if (pa > 1)
1045 {
1046 *(wxUint16*)buf = cc[1];
1047 buf += sizeof(wxUint16);
1048 }
1049 }
1050
1051 len += pa*sizeof(wxUint16);
1052 psz++;
1053 }
1054 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1055
1056 return len;
1057 }
1058
1059
1060 // swap 16bit MB to 32bit String
1061 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1062 {
1063 size_t len=0;
1064
1065 while (*(wxUint16*)psz && (!buf || len < n))
1066 {
1067 wxUint32 cc;
1068 char tmp[4];
1069 tmp[0]=psz[1]; tmp[1]=psz[0];
1070 tmp[2]=psz[3]; tmp[3]=psz[2];
1071
1072 size_t pa=decode_utf16((wxUint16*)tmp, cc);
1073 if (pa == (size_t)-1)
1074 return pa;
1075
1076 if (buf)
1077 *buf++ = (wchar_t)cc;
1078
1079 len++;
1080 psz += pa * sizeof(wxUint16);
1081 }
1082 if (buf && len<n) *buf=0;
1083
1084 return len;
1085 }
1086
1087
1088 // swap 32bit String to 16bit MB
1089 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1090 {
1091 size_t len=0;
1092
1093 while (*psz && (!buf || len < n))
1094 {
1095 wxUint16 cc[2];
1096 size_t pa=encode_utf16(*psz, cc);
1097
1098 if (pa == (size_t)-1)
1099 return pa;
1100
1101 if (buf)
1102 {
1103 *buf++ = ((char*)cc)[1];
1104 *buf++ = ((char*)cc)[0];
1105 if (pa > 1)
1106 {
1107 *buf++ = ((char*)cc)[3];
1108 *buf++ = ((char*)cc)[2];
1109 }
1110 }
1111
1112 len += pa*sizeof(wxUint16);
1113 psz++;
1114 }
1115 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1116
1117 return len;
1118 }
1119
1120 #endif // WC_UTF16
1121
1122
1123 // ----------------------------------------------------------------------------
1124 // UTF-32
1125 // ----------------------------------------------------------------------------
1126
// "straight" is the alias of the UTF-32 converter class whose byte order is
// the one selected by WORDS_BIGENDIAN (big endian if it is defined, little
// endian otherwise) and "swap" of the other one, so that each method below
// is written only once and ends up in the right class
1127 #ifdef WORDS_BIGENDIAN
1128 #define wxMBConvUTF32straight wxMBConvUTF32BE
1129 #define wxMBConvUTF32swap wxMBConvUTF32LE
1130 #else
1131 #define wxMBConvUTF32swap wxMBConvUTF32BE
1132 #define wxMBConvUTF32straight wxMBConvUTF32LE
1133 #endif
1134
1135
// the global UTF-32 converter objects, one for each byte order
1136 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1137 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1138
1139
1140 #ifdef WC_UTF16
1141
1142 // copy 32bit MB to 16bit String
1143 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1144 {
1145 size_t len=0;
1146
1147 while (*(wxUint32*)psz && (!buf || len < n))
1148 {
1149 wxUint16 cc[2];
1150
1151 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1152 if (pa == (size_t)-1)
1153 return pa;
1154
1155 if (buf)
1156 {
1157 *buf++ = cc[0];
1158 if (pa > 1)
1159 *buf++ = cc[1];
1160 }
1161 len += pa;
1162 psz += sizeof(wxUint32);
1163 }
1164 if (buf && len<n) *buf=0;
1165
1166 return len;
1167 }
1168
1169
1170 // copy 16bit String to 32bit MB
1171 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1172 {
1173 size_t len=0;
1174
1175 while (*psz && (!buf || len < n))
1176 {
1177 wxUint32 cc;
1178
1179 // cast is ok for WC_UTF16
1180 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1181 if (pa == (size_t)-1)
1182 return pa;
1183
1184 if (buf)
1185 {
1186 *(wxUint32*)buf = cc;
1187 buf += sizeof(wxUint32);
1188 }
1189 len += sizeof(wxUint32);
1190 psz += pa;
1191 }
1192
1193 if (buf && len<=n-sizeof(wxUint32))
1194 *(wxUint32*)buf=0;
1195
1196 return len;
1197 }
1198
1199
1200
1201 // swap 32bit MB to 16bit String
1202 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1203 {
1204 size_t len=0;
1205
1206 while (*(wxUint32*)psz && (!buf || len < n))
1207 {
1208 char tmp[4];
1209 tmp[0] = psz[3]; tmp[1] = psz[2];
1210 tmp[2] = psz[1]; tmp[3] = psz[0];
1211
1212
1213 wxUint16 cc[2];
1214
1215 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1216 if (pa == (size_t)-1)
1217 return pa;
1218
1219 if (buf)
1220 {
1221 *buf++ = cc[0];
1222 if (pa > 1)
1223 *buf++ = cc[1];
1224 }
1225 len += pa;
1226 psz += sizeof(wxUint32);
1227 }
1228
1229 if (buf && len<n)
1230 *buf=0;
1231
1232 return len;
1233 }
1234
1235
1236 // swap 16bit String to 32bit MB
1237 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1238 {
1239 size_t len=0;
1240
1241 while (*psz && (!buf || len < n))
1242 {
1243 char cc[4];
1244
1245 // cast is ok for WC_UTF16
1246 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1247 if (pa == (size_t)-1)
1248 return pa;
1249
1250 if (buf)
1251 {
1252 *buf++ = cc[3];
1253 *buf++ = cc[2];
1254 *buf++ = cc[1];
1255 *buf++ = cc[0];
1256 }
1257 len += sizeof(wxUint32);
1258 psz += pa;
1259 }
1260
1261 if (buf && len<=n-sizeof(wxUint32))
1262 *(wxUint32*)buf=0;
1263
1264 return len;
1265 }
1266
1267 #else // WC_UTF16
1268
1269
1270 // copy 32bit MB to 32bit String
1271 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1272 {
1273 size_t len=0;
1274
1275 while (*(wxUint32*)psz && (!buf || len < n))
1276 {
1277 if (buf)
1278 *buf++ = (wchar_t)(*(wxUint32*)psz);
1279 len++;
1280 psz += sizeof(wxUint32);
1281 }
1282
1283 if (buf && len<n)
1284 *buf=0;
1285
1286 return len;
1287 }
1288
1289
1290 // copy 32bit String to 32bit MB
1291 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1292 {
1293 size_t len=0;
1294
1295 while (*psz && (!buf || len < n))
1296 {
1297 if (buf)
1298 {
1299 *(wxUint32*)buf = *psz;
1300 buf += sizeof(wxUint32);
1301 }
1302
1303 len += sizeof(wxUint32);
1304 psz++;
1305 }
1306
1307 if (buf && len<=n-sizeof(wxUint32))
1308 *(wxUint32*)buf=0;
1309
1310 return len;
1311 }
1312
1313
1314 // swap 32bit MB to 32bit String
1315 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1316 {
1317 size_t len=0;
1318
1319 while (*(wxUint32*)psz && (!buf || len < n))
1320 {
1321 if (buf)
1322 {
1323 ((char *)buf)[0] = psz[3];
1324 ((char *)buf)[1] = psz[2];
1325 ((char *)buf)[2] = psz[1];
1326 ((char *)buf)[3] = psz[0];
1327 buf++;
1328 }
1329 len++;
1330 psz += sizeof(wxUint32);
1331 }
1332
1333 if (buf && len<n)
1334 *buf=0;
1335
1336 return len;
1337 }
1338
1339
1340 // swap 32bit String to 32bit MB
1341 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1342 {
1343 size_t len=0;
1344
1345 while (*psz && (!buf || len < n))
1346 {
1347 if (buf)
1348 {
1349 *buf++ = ((char *)psz)[3];
1350 *buf++ = ((char *)psz)[2];
1351 *buf++ = ((char *)psz)[1];
1352 *buf++ = ((char *)psz)[0];
1353 }
1354 len += sizeof(wxUint32);
1355 psz++;
1356 }
1357
1358 if (buf && len<=n-sizeof(wxUint32))
1359 *(wxUint32*)buf=0;
1360
1361 return len;
1362 }
1363
1364
1365 #endif // WC_UTF16
1366
1367
1368 // ============================================================================
1369 // The classes doing conversion using the iconv_xxx() functions
1370 // ============================================================================
1371
1372 #ifdef HAVE_ICONV
1373
1374 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1375 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1376 // (unless there's yet another bug in glibc) the only case when iconv()
1377 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1378 // left in the input buffer -- when _real_ error occurs,
1379 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1380 // iconv() failure.
1381 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call returning cres and
// leaving bufLeft bytes in its input buffer should be treated as failed.
//
// The macro arguments are parenthesised so that expressions using operators
// of lower precedence than == (e.g. "a & b") can be passed safely.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast a pointer to the input buffer pointer to the type iconv() expects
// (ICONV_CONST is defined elsewhere)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value compared against the result of iconv_open() to detect failure
#define ICONV_T_INVALID ((iconv_t)-1)
1392
1393 #if SIZEOF_WCHAR_T == 4
1394 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1395 #define WC_ENC wxFONTENCODING_UTF32
1396 #elif SIZEOF_WCHAR_T == 2
1397 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1398 #define WC_ENC wxFONTENCODING_UTF16
1399 #else // sizeof(wchar_t) != 2 nor 4
1400 // does this ever happen?
1401 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1402 #endif
1403
1404 // ----------------------------------------------------------------------------
1405 // wxMBConv_iconv: encapsulates an iconv character set
1406 // ----------------------------------------------------------------------------
1407
1408 class wxMBConv_iconv : public wxMBConv
1409 {
1410 public:
1411 wxMBConv_iconv(const wxChar *name);
1412 virtual ~wxMBConv_iconv();
1413
1414 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1415 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1416
1417 // classify this encoding as explained in wxMBConv::GetMBNulLen()
1418 // comment
1419 virtual size_t GetMBNulLen() const;
1420
1421 bool IsOk() const
1422 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1423
1424 protected:
1425 // the iconv handlers used to translate from multibyte to wide char and in
1426 // the other direction
1427 iconv_t m2w,
1428 w2m;
1429 #if wxUSE_THREADS
1430 // guards access to m2w and w2m objects
1431 wxMutex m_iconvMutex;
1432 #endif
1433
1434 private:
1435 // the name (for iconv_open()) of a wide char charset -- if none is
1436 // available on this machine, it will remain NULL
1437 static wxString ms_wcCharsetName;
1438
1439 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1440 // different endian-ness than the native one
1441 static bool ms_wcNeedsSwap;
1442
1443 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1444 // initially
1445 size_t m_minMBCharWidth;
1446 };
1447
1448 // make the constructor available for unit testing
1449 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1450 {
1451 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1452 if ( !result->IsOk() )
1453 {
1454 delete result;
1455 return 0;
1456 }
1457 return result;
1458 }
1459
1460 wxString wxMBConv_iconv::ms_wcCharsetName;
1461 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1462
1463 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1464 {
1465 m_minMBCharWidth = 0;
1466
1467 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1468 // names for the charsets
1469 const wxCharBuffer cname(wxString(name).ToAscii());
1470
1471 // check for charset that represents wchar_t:
1472 if ( ms_wcCharsetName.empty() )
1473 {
1474 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1475
1476 #if wxUSE_FONTMAP
1477 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1478 #else // !wxUSE_FONTMAP
1479 static const wxChar *names[] =
1480 {
1481 #if SIZEOF_WCHAR_T == 4
1482 _T("UCS-4"),
1483 #elif SIZEOF_WCHAR_T = 2
1484 _T("UCS-2"),
1485 #endif
1486 NULL
1487 };
1488 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1489
1490 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1491 {
1492 const wxString nameCS(*names);
1493
1494 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1495 wxString nameXE(nameCS);
1496 #ifdef WORDS_BIGENDIAN
1497 nameXE += _T("BE");
1498 #else // little endian
1499 nameXE += _T("LE");
1500 #endif
1501
1502 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1503 nameXE.c_str());
1504
1505 m2w = iconv_open(nameXE.ToAscii(), cname);
1506 if ( m2w == ICONV_T_INVALID )
1507 {
1508 // try charset w/o bytesex info (e.g. "UCS4")
1509 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1510 nameCS.c_str());
1511 m2w = iconv_open(nameCS.ToAscii(), cname);
1512
1513 // and check for bytesex ourselves:
1514 if ( m2w != ICONV_T_INVALID )
1515 {
1516 char buf[2], *bufPtr;
1517 wchar_t wbuf[2], *wbufPtr;
1518 size_t insz, outsz;
1519 size_t res;
1520
1521 buf[0] = 'A';
1522 buf[1] = 0;
1523 wbuf[0] = 0;
1524 insz = 2;
1525 outsz = SIZEOF_WCHAR_T * 2;
1526 wbufPtr = wbuf;
1527 bufPtr = buf;
1528
1529 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1530 (char**)&wbufPtr, &outsz);
1531
1532 if (ICONV_FAILED(res, insz))
1533 {
1534 wxLogLastError(wxT("iconv"));
1535 wxLogError(_("Conversion to charset '%s' doesn't work."),
1536 nameCS.c_str());
1537 }
1538 else // ok, can convert to this encoding, remember it
1539 {
1540 ms_wcCharsetName = nameCS;
1541 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1542 }
1543 }
1544 }
1545 else // use charset not requiring byte swapping
1546 {
1547 ms_wcCharsetName = nameXE;
1548 }
1549 }
1550
1551 wxLogTrace(TRACE_STRCONV,
1552 wxT("iconv wchar_t charset is \"%s\"%s"),
1553 ms_wcCharsetName.empty() ? _T("<none>")
1554 : ms_wcCharsetName.c_str(),
1555 ms_wcNeedsSwap ? _T(" (needs swap)")
1556 : _T(""));
1557 }
1558 else // we already have ms_wcCharsetName
1559 {
1560 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1561 }
1562
1563 if ( ms_wcCharsetName.empty() )
1564 {
1565 w2m = ICONV_T_INVALID;
1566 }
1567 else
1568 {
1569 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1570 if ( w2m == ICONV_T_INVALID )
1571 {
1572 wxLogTrace(TRACE_STRCONV,
1573 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1574 ms_wcCharsetName.c_str(), cname.data());
1575 }
1576 }
1577 }
1578
1579 wxMBConv_iconv::~wxMBConv_iconv()
1580 {
1581 if ( m2w != ICONV_T_INVALID )
1582 iconv_close(m2w);
1583 if ( w2m != ICONV_T_INVALID )
1584 iconv_close(w2m);
1585 }
1586
1587 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1588 {
1589 // find the string length: notice that must be done differently for
1590 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1591 size_t inbuf;
1592 const size_t nulLen = GetMBNulLen();
1593 switch ( nulLen )
1594 {
1595 default:
1596 return (size_t)-1;
1597
1598 case 1:
1599 inbuf = strlen(psz); // arguably more optimized than our version
1600 break;
1601
1602 case 2:
1603 case 4:
1604 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1605 // they also have to start at character boundary and not span two
1606 // adjacent characters
1607 const char *p;
1608 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1609 ;
1610 inbuf = p - psz;
1611 break;
1612 }
1613
1614 #if wxUSE_THREADS
1615 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1616 // Unfortunately there is a couple of global wxCSConv objects such as
1617 // wxConvLocal that are used all over wx code, so we have to make sure
1618 // the handle is used by at most one thread at the time. Otherwise
1619 // only a few wx classes would be safe to use from non-main threads
1620 // as MB<->WC conversion would fail "randomly".
1621 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1622 #endif // wxUSE_THREADS
1623
1624
1625 size_t outbuf = n * SIZEOF_WCHAR_T;
1626 size_t res, cres;
1627 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1628 wchar_t *bufPtr = buf;
1629 const char *pszPtr = psz;
1630
1631 if (buf)
1632 {
1633 // have destination buffer, convert there
1634 cres = iconv(m2w,
1635 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1636 (char**)&bufPtr, &outbuf);
1637 res = n - (outbuf / SIZEOF_WCHAR_T);
1638
1639 if (ms_wcNeedsSwap)
1640 {
1641 // convert to native endianness
1642 for ( unsigned i = 0; i < res; i++ )
1643 buf[n] = WC_BSWAP(buf[i]);
1644 }
1645
1646 // NUL-terminate the string if there is any space left
1647 if (res < n)
1648 buf[res] = 0;
1649 }
1650 else
1651 {
1652 // no destination buffer... convert using temp buffer
1653 // to calculate destination buffer requirement
1654 wchar_t tbuf[8];
1655 res = 0;
1656 do {
1657 bufPtr = tbuf;
1658 outbuf = 8*SIZEOF_WCHAR_T;
1659
1660 cres = iconv(m2w,
1661 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1662 (char**)&bufPtr, &outbuf );
1663
1664 res += 8-(outbuf/SIZEOF_WCHAR_T);
1665 } while ((cres==(size_t)-1) && (errno==E2BIG));
1666 }
1667
1668 if (ICONV_FAILED(cres, inbuf))
1669 {
1670 //VS: it is ok if iconv fails, hence trace only
1671 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1672 return (size_t)-1;
1673 }
1674
1675 return res;
1676 }
1677
1678 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1679 {
1680 #if wxUSE_THREADS
1681 // NB: explained in MB2WC
1682 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1683 #endif
1684
1685 size_t inlen = wxWcslen(psz);
1686 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1687 size_t outbuf = n;
1688 size_t res, cres;
1689
1690 wchar_t *tmpbuf = 0;
1691
1692 if (ms_wcNeedsSwap)
1693 {
1694 // need to copy to temp buffer to switch endianness
1695 // (doing WC_BSWAP twice on the original buffer won't help, as it
1696 // could be in read-only memory, or be accessed in some other thread)
1697 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1698 for ( size_t i = 0; i < inlen; i++ )
1699 tmpbuf[n] = WC_BSWAP(psz[i]);
1700 tmpbuf[inlen] = L'\0';
1701 psz = tmpbuf;
1702 }
1703
1704 if (buf)
1705 {
1706 // have destination buffer, convert there
1707 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1708
1709 res = n-outbuf;
1710
1711 // NB: iconv was given only wcslen(psz) characters on input, and so
1712 // it couldn't convert the trailing zero. Let's do it ourselves
1713 // if there's some room left for it in the output buffer.
1714 if (res < n)
1715 buf[0] = 0;
1716 }
1717 else
1718 {
1719 // no destination buffer... convert using temp buffer
1720 // to calculate destination buffer requirement
1721 char tbuf[16];
1722 res = 0;
1723 do {
1724 buf = tbuf; outbuf = 16;
1725
1726 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1727
1728 res += 16 - outbuf;
1729 } while ((cres==(size_t)-1) && (errno==E2BIG));
1730 }
1731
1732 if (ms_wcNeedsSwap)
1733 {
1734 free(tmpbuf);
1735 }
1736
1737 if (ICONV_FAILED(cres, inbuf))
1738 {
1739 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1740 return (size_t)-1;
1741 }
1742
1743 return res;
1744 }
1745
1746 size_t wxMBConv_iconv::GetMBNulLen() const
1747 {
1748 if ( m_minMBCharWidth == 0 )
1749 {
1750 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1751
1752 #if wxUSE_THREADS
1753 // NB: explained in MB2WC
1754 wxMutexLocker lock(self->m_iconvMutex);
1755 #endif
1756
1757 wchar_t *wnul = L"";
1758 char buf[8]; // should be enough for NUL in any encoding
1759 size_t inLen = sizeof(wchar_t),
1760 outLen = WXSIZEOF(buf);
1761 char *in = (char *)wnul;
1762 char *out = buf;
1763 if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1764 {
1765 self->m_minMBCharWidth = (size_t)-1;
1766 }
1767 else // ok
1768 {
1769 self->m_minMBCharWidth = out - buf;
1770 }
1771 }
1772
1773 return m_minMBCharWidth;
1774 }
1775
1776 #endif // HAVE_ICONV
1777
1778
1779 // ============================================================================
1780 // Win32 conversion classes
1781 // ============================================================================
1782
1783 #ifdef wxHAVE_WIN32_MB2WC
1784
1785 // from utils.cpp
1786 #if wxUSE_FONTMAP
1787 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1788 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1789 #endif
1790
1791 class wxMBConv_win32 : public wxMBConv
1792 {
1793 public:
1794 wxMBConv_win32()
1795 {
1796 m_CodePage = CP_ACP;
1797 m_minMBCharWidth = 0;
1798 }
1799
1800 #if wxUSE_FONTMAP
1801 wxMBConv_win32(const wxChar* name)
1802 {
1803 m_CodePage = wxCharsetToCodepage(name);
1804 m_minMBCharWidth = 0;
1805 }
1806
1807 wxMBConv_win32(wxFontEncoding encoding)
1808 {
1809 m_CodePage = wxEncodingToCodepage(encoding);
1810 m_minMBCharWidth = 0;
1811 }
1812 #endif // wxUSE_FONTMAP
1813
1814 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1815 {
1816 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1817 // the behaviour is not compatible with the Unix version (using iconv)
1818 // and break the library itself, e.g. wxTextInputStream::NextChar()
1819 // wouldn't work if reading an incomplete MB char didn't result in an
1820 // error
1821 //
1822 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1823 // Win XP or newer and it is not supported for UTF-[78] so we always
1824 // use our own conversions in this case. See
1825 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1826 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1827 if ( m_CodePage == CP_UTF8 )
1828 {
1829 return wxConvUTF8.MB2WC(buf, psz, n);
1830 }
1831
1832 if ( m_CodePage == CP_UTF7 )
1833 {
1834 return wxConvUTF7.MB2WC(buf, psz, n);
1835 }
1836
1837 int flags = 0;
1838 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
1839 IsAtLeastWin2kSP4() )
1840 {
1841 flags = MB_ERR_INVALID_CHARS;
1842 }
1843
1844 const size_t len = ::MultiByteToWideChar
1845 (
1846 m_CodePage, // code page
1847 flags, // flags: fall on error
1848 psz, // input string
1849 -1, // its length (NUL-terminated)
1850 buf, // output string
1851 buf ? n : 0 // size of output buffer
1852 );
1853 if ( !len )
1854 {
1855 // function totally failed
1856 return (size_t)-1;
1857 }
1858
1859 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1860 // check if we succeeded, by doing a double trip:
1861 if ( !flags && buf )
1862 {
1863 const size_t mbLen = strlen(psz);
1864 wxCharBuffer mbBuf(mbLen);
1865 if ( ::WideCharToMultiByte
1866 (
1867 m_CodePage,
1868 0,
1869 buf,
1870 -1,
1871 mbBuf.data(),
1872 mbLen + 1, // size in bytes, not length
1873 NULL,
1874 NULL
1875 ) == 0 ||
1876 strcmp(mbBuf, psz) != 0 )
1877 {
1878 // we didn't obtain the same thing we started from, hence
1879 // the conversion was lossy and we consider that it failed
1880 return (size_t)-1;
1881 }
1882 }
1883
1884 // note that it returns count of written chars for buf != NULL and size
1885 // of the needed buffer for buf == NULL so in either case the length of
1886 // the string (which never includes the terminating NUL) is one less
1887 return len - 1;
1888 }
1889
1890 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1891 {
1892 /*
1893 we have a problem here: by default, WideCharToMultiByte() may
1894 replace characters unrepresentable in the target code page with bad
1895 quality approximations such as turning "1/2" symbol (U+00BD) into
1896 "1" for the code pages which don't have it and we, obviously, want
1897 to avoid this at any price
1898
1899 the trouble is that this function does it _silently_, i.e. it won't
1900 even tell us whether it did or not... Win98/2000 and higher provide
1901 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1902 we have to resort to a round trip, i.e. check that converting back
1903 results in the same string -- this is, of course, expensive but
1904 otherwise we simply can't be sure to not garble the data.
1905 */
1906
1907 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1908 // it doesn't work with CJK encodings (which we test for rather roughly
1909 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1910 // supporting it
1911 BOOL usedDef wxDUMMY_INITIALIZE(false);
1912 BOOL *pUsedDef;
1913 int flags;
1914 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1915 {
1916 // it's our lucky day
1917 flags = WC_NO_BEST_FIT_CHARS;
1918 pUsedDef = &usedDef;
1919 }
1920 else // old system or unsupported encoding
1921 {
1922 flags = 0;
1923 pUsedDef = NULL;
1924 }
1925
1926 const size_t len = ::WideCharToMultiByte
1927 (
1928 m_CodePage, // code page
1929 flags, // either none or no best fit
1930 pwz, // input string
1931 -1, // it is (wide) NUL-terminated
1932 buf, // output buffer
1933 buf ? n : 0, // and its size
1934 NULL, // default "replacement" char
1935 pUsedDef // [out] was it used?
1936 );
1937
1938 if ( !len )
1939 {
1940 // function totally failed
1941 return (size_t)-1;
1942 }
1943
1944 // if we were really converting, check if we succeeded
1945 if ( buf )
1946 {
1947 if ( flags )
1948 {
1949 // check if the conversion failed, i.e. if any replacements
1950 // were done
1951 if ( usedDef )
1952 return (size_t)-1;
1953 }
1954 else // we must resort to double tripping...
1955 {
1956 wxWCharBuffer wcBuf(n);
1957 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1958 wcscmp(wcBuf, pwz) != 0 )
1959 {
1960 // we didn't obtain the same thing we started from, hence
1961 // the conversion was lossy and we consider that it failed
1962 return (size_t)-1;
1963 }
1964 }
1965 }
1966
1967 // see the comment above for the reason of "len - 1"
1968 return len - 1;
1969 }
1970
1971 virtual size_t GetMBNulLen() const
1972 {
1973 if ( m_minMBCharWidth == 0 )
1974 {
1975 int len = ::WideCharToMultiByte
1976 (
1977 m_CodePage, // code page
1978 0, // no flags
1979 L"", // input string
1980 1, // translate just the NUL
1981 NULL, // output buffer
1982 0, // and its size
1983 NULL, // no replacement char
1984 NULL // [out] don't care if it was used
1985 );
1986
1987 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1988 switch ( len )
1989 {
1990 default:
1991 wxLogDebug(_T("Unexpected NUL length %d"), len);
1992 // fall through
1993
1994 case 0:
1995 self->m_minMBCharWidth = (size_t)-1;
1996 break;
1997
1998 case 1:
1999 case 2:
2000 case 4:
2001 self->m_minMBCharWidth = len;
2002 break;
2003 }
2004 }
2005
2006 return m_minMBCharWidth;
2007 }
2008
2009 bool IsOk() const { return m_CodePage != -1; }
2010
2011 private:
2012 static bool CanUseNoBestFit()
2013 {
2014 static int s_isWin98Or2k = -1;
2015
2016 if ( s_isWin98Or2k == -1 )
2017 {
2018 int verMaj, verMin;
2019 switch ( wxGetOsVersion(&verMaj, &verMin) )
2020 {
2021 case wxWIN95:
2022 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2023 break;
2024
2025 case wxWINDOWS_NT:
2026 s_isWin98Or2k = verMaj >= 5;
2027 break;
2028
2029 default:
2030 // unknown, be conseravtive by default
2031 s_isWin98Or2k = 0;
2032 }
2033
2034 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2035 }
2036
2037 return s_isWin98Or2k == 1;
2038 }
2039
2040 static bool IsAtLeastWin2kSP4()
2041 {
2042 #ifdef __WXWINCE__
2043 return false;
2044 #else
2045 static int s_isAtLeastWin2kSP4 = -1;
2046
2047 if ( s_isAtLeastWin2kSP4 == -1 )
2048 {
2049 OSVERSIONINFOEX ver;
2050
2051 memset(&ver, 0, sizeof(ver));
2052 ver.dwOSVersionInfoSize = sizeof(ver);
2053 GetVersionEx((OSVERSIONINFO*)&ver);
2054
2055 s_isAtLeastWin2kSP4 =
2056 ((ver.dwMajorVersion > 5) || // Vista+
2057 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2058 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2059 ver.wServicePackMajor >= 4)) // 2000 SP4+
2060 ? 1 : 0;
2061 }
2062
2063 return s_isAtLeastWin2kSP4 == 1;
2064 #endif
2065 }
2066
2067
2068 // the code page we're working with
2069 long m_CodePage;
2070
2071 // cached result of GetMBNulLen(), set to 0 initially meaning
2072 // "unknown"
2073 size_t m_minMBCharWidth;
2074 };
2075
2076 #endif // wxHAVE_WIN32_MB2WC
2077
2078 // ============================================================================
2079 // Cocoa conversion classes
2080 // ============================================================================
2081
2082 #if defined(__WXCOCOA__)
2083
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
2087
2088 #include <CoreFoundation/CFString.h>
2089 #include <CoreFoundation/CFStringEncodingExt.h>
2090
2091 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2092 {
2093 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2094 if ( encoding == wxFONTENCODING_DEFAULT )
2095 {
2096 enc = CFStringGetSystemEncoding();
2097 }
2098 else switch( encoding)
2099 {
2100 case wxFONTENCODING_ISO8859_1 :
2101 enc = kCFStringEncodingISOLatin1 ;
2102 break ;
2103 case wxFONTENCODING_ISO8859_2 :
2104 enc = kCFStringEncodingISOLatin2;
2105 break ;
2106 case wxFONTENCODING_ISO8859_3 :
2107 enc = kCFStringEncodingISOLatin3 ;
2108 break ;
2109 case wxFONTENCODING_ISO8859_4 :
2110 enc = kCFStringEncodingISOLatin4;
2111 break ;
2112 case wxFONTENCODING_ISO8859_5 :
2113 enc = kCFStringEncodingISOLatinCyrillic;
2114 break ;
2115 case wxFONTENCODING_ISO8859_6 :
2116 enc = kCFStringEncodingISOLatinArabic;
2117 break ;
2118 case wxFONTENCODING_ISO8859_7 :
2119 enc = kCFStringEncodingISOLatinGreek;
2120 break ;
2121 case wxFONTENCODING_ISO8859_8 :
2122 enc = kCFStringEncodingISOLatinHebrew;
2123 break ;
2124 case wxFONTENCODING_ISO8859_9 :
2125 enc = kCFStringEncodingISOLatin5;
2126 break ;
2127 case wxFONTENCODING_ISO8859_10 :
2128 enc = kCFStringEncodingISOLatin6;
2129 break ;
2130 case wxFONTENCODING_ISO8859_11 :
2131 enc = kCFStringEncodingISOLatinThai;
2132 break ;
2133 case wxFONTENCODING_ISO8859_13 :
2134 enc = kCFStringEncodingISOLatin7;
2135 break ;
2136 case wxFONTENCODING_ISO8859_14 :
2137 enc = kCFStringEncodingISOLatin8;
2138 break ;
2139 case wxFONTENCODING_ISO8859_15 :
2140 enc = kCFStringEncodingISOLatin9;
2141 break ;
2142
2143 case wxFONTENCODING_KOI8 :
2144 enc = kCFStringEncodingKOI8_R;
2145 break ;
2146 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2147 enc = kCFStringEncodingDOSRussian;
2148 break ;
2149
2150 // case wxFONTENCODING_BULGARIAN :
2151 // enc = ;
2152 // break ;
2153
2154 case wxFONTENCODING_CP437 :
2155 enc =kCFStringEncodingDOSLatinUS ;
2156 break ;
2157 case wxFONTENCODING_CP850 :
2158 enc = kCFStringEncodingDOSLatin1;
2159 break ;
2160 case wxFONTENCODING_CP852 :
2161 enc = kCFStringEncodingDOSLatin2;
2162 break ;
2163 case wxFONTENCODING_CP855 :
2164 enc = kCFStringEncodingDOSCyrillic;
2165 break ;
2166 case wxFONTENCODING_CP866 :
2167 enc =kCFStringEncodingDOSRussian ;
2168 break ;
2169 case wxFONTENCODING_CP874 :
2170 enc = kCFStringEncodingDOSThai;
2171 break ;
2172 case wxFONTENCODING_CP932 :
2173 enc = kCFStringEncodingDOSJapanese;
2174 break ;
2175 case wxFONTENCODING_CP936 :
2176 enc =kCFStringEncodingDOSChineseSimplif ;
2177 break ;
2178 case wxFONTENCODING_CP949 :
2179 enc = kCFStringEncodingDOSKorean;
2180 break ;
2181 case wxFONTENCODING_CP950 :
2182 enc = kCFStringEncodingDOSChineseTrad;
2183 break ;
2184 case wxFONTENCODING_CP1250 :
2185 enc = kCFStringEncodingWindowsLatin2;
2186 break ;
2187 case wxFONTENCODING_CP1251 :
2188 enc =kCFStringEncodingWindowsCyrillic ;
2189 break ;
2190 case wxFONTENCODING_CP1252 :
2191 enc =kCFStringEncodingWindowsLatin1 ;
2192 break ;
2193 case wxFONTENCODING_CP1253 :
2194 enc = kCFStringEncodingWindowsGreek;
2195 break ;
2196 case wxFONTENCODING_CP1254 :
2197 enc = kCFStringEncodingWindowsLatin5;
2198 break ;
2199 case wxFONTENCODING_CP1255 :
2200 enc =kCFStringEncodingWindowsHebrew ;
2201 break ;
2202 case wxFONTENCODING_CP1256 :
2203 enc =kCFStringEncodingWindowsArabic ;
2204 break ;
2205 case wxFONTENCODING_CP1257 :
2206 enc = kCFStringEncodingWindowsBalticRim;
2207 break ;
2208 // This only really encodes to UTF7 (if that) evidently
2209 // case wxFONTENCODING_UTF7 :
2210 // enc = kCFStringEncodingNonLossyASCII ;
2211 // break ;
2212 case wxFONTENCODING_UTF8 :
2213 enc = kCFStringEncodingUTF8 ;
2214 break ;
2215 case wxFONTENCODING_EUC_JP :
2216 enc = kCFStringEncodingEUC_JP;
2217 break ;
2218 case wxFONTENCODING_UTF16 :
2219 enc = kCFStringEncodingUnicode ;
2220 break ;
2221 case wxFONTENCODING_MACROMAN :
2222 enc = kCFStringEncodingMacRoman ;
2223 break ;
2224 case wxFONTENCODING_MACJAPANESE :
2225 enc = kCFStringEncodingMacJapanese ;
2226 break ;
2227 case wxFONTENCODING_MACCHINESETRAD :
2228 enc = kCFStringEncodingMacChineseTrad ;
2229 break ;
2230 case wxFONTENCODING_MACKOREAN :
2231 enc = kCFStringEncodingMacKorean ;
2232 break ;
2233 case wxFONTENCODING_MACARABIC :
2234 enc = kCFStringEncodingMacArabic ;
2235 break ;
2236 case wxFONTENCODING_MACHEBREW :
2237 enc = kCFStringEncodingMacHebrew ;
2238 break ;
2239 case wxFONTENCODING_MACGREEK :
2240 enc = kCFStringEncodingMacGreek ;
2241 break ;
2242 case wxFONTENCODING_MACCYRILLIC :
2243 enc = kCFStringEncodingMacCyrillic ;
2244 break ;
2245 case wxFONTENCODING_MACDEVANAGARI :
2246 enc = kCFStringEncodingMacDevanagari ;
2247 break ;
2248 case wxFONTENCODING_MACGURMUKHI :
2249 enc = kCFStringEncodingMacGurmukhi ;
2250 break ;
2251 case wxFONTENCODING_MACGUJARATI :
2252 enc = kCFStringEncodingMacGujarati ;
2253 break ;
2254 case wxFONTENCODING_MACORIYA :
2255 enc = kCFStringEncodingMacOriya ;
2256 break ;
2257 case wxFONTENCODING_MACBENGALI :
2258 enc = kCFStringEncodingMacBengali ;
2259 break ;
2260 case wxFONTENCODING_MACTAMIL :
2261 enc = kCFStringEncodingMacTamil ;
2262 break ;
2263 case wxFONTENCODING_MACTELUGU :
2264 enc = kCFStringEncodingMacTelugu ;
2265 break ;
2266 case wxFONTENCODING_MACKANNADA :
2267 enc = kCFStringEncodingMacKannada ;
2268 break ;
2269 case wxFONTENCODING_MACMALAJALAM :
2270 enc = kCFStringEncodingMacMalayalam ;
2271 break ;
2272 case wxFONTENCODING_MACSINHALESE :
2273 enc = kCFStringEncodingMacSinhalese ;
2274 break ;
2275 case wxFONTENCODING_MACBURMESE :
2276 enc = kCFStringEncodingMacBurmese ;
2277 break ;
2278 case wxFONTENCODING_MACKHMER :
2279 enc = kCFStringEncodingMacKhmer ;
2280 break ;
2281 case wxFONTENCODING_MACTHAI :
2282 enc = kCFStringEncodingMacThai ;
2283 break ;
2284 case wxFONTENCODING_MACLAOTIAN :
2285 enc = kCFStringEncodingMacLaotian ;
2286 break ;
2287 case wxFONTENCODING_MACGEORGIAN :
2288 enc = kCFStringEncodingMacGeorgian ;
2289 break ;
2290 case wxFONTENCODING_MACARMENIAN :
2291 enc = kCFStringEncodingMacArmenian ;
2292 break ;
2293 case wxFONTENCODING_MACCHINESESIMP :
2294 enc = kCFStringEncodingMacChineseSimp ;
2295 break ;
2296 case wxFONTENCODING_MACTIBETAN :
2297 enc = kCFStringEncodingMacTibetan ;
2298 break ;
2299 case wxFONTENCODING_MACMONGOLIAN :
2300 enc = kCFStringEncodingMacMongolian ;
2301 break ;
2302 case wxFONTENCODING_MACETHIOPIC :
2303 enc = kCFStringEncodingMacEthiopic ;
2304 break ;
2305 case wxFONTENCODING_MACCENTRALEUR :
2306 enc = kCFStringEncodingMacCentralEurRoman ;
2307 break ;
2308 case wxFONTENCODING_MACVIATNAMESE :
2309 enc = kCFStringEncodingMacVietnamese ;
2310 break ;
2311 case wxFONTENCODING_MACARABICEXT :
2312 enc = kCFStringEncodingMacExtArabic ;
2313 break ;
2314 case wxFONTENCODING_MACSYMBOL :
2315 enc = kCFStringEncodingMacSymbol ;
2316 break ;
2317 case wxFONTENCODING_MACDINGBATS :
2318 enc = kCFStringEncodingMacDingbats ;
2319 break ;
2320 case wxFONTENCODING_MACTURKISH :
2321 enc = kCFStringEncodingMacTurkish ;
2322 break ;
2323 case wxFONTENCODING_MACCROATIAN :
2324 enc = kCFStringEncodingMacCroatian ;
2325 break ;
2326 case wxFONTENCODING_MACICELANDIC :
2327 enc = kCFStringEncodingMacIcelandic ;
2328 break ;
2329 case wxFONTENCODING_MACROMANIAN :
2330 enc = kCFStringEncodingMacRomanian ;
2331 break ;
2332 case wxFONTENCODING_MACCELTIC :
2333 enc = kCFStringEncodingMacCeltic ;
2334 break ;
2335 case wxFONTENCODING_MACGAELIC :
2336 enc = kCFStringEncodingMacGaelic ;
2337 break ;
2338 // case wxFONTENCODING_MACKEYBOARD :
2339 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2340 // break ;
2341 default :
2342 // because gcc is picky
2343 break ;
2344 } ;
2345 return enc ;
2346 }
2347
2348 class wxMBConv_cocoa : public wxMBConv
2349 {
2350 public:
2351 wxMBConv_cocoa()
2352 {
2353 Init(CFStringGetSystemEncoding()) ;
2354 }
2355
2356 #if wxUSE_FONTMAP
2357 wxMBConv_cocoa(const wxChar* name)
2358 {
2359 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2360 }
2361 #endif
2362
2363 wxMBConv_cocoa(wxFontEncoding encoding)
2364 {
2365 Init( wxCFStringEncFromFontEnc(encoding) );
2366 }
2367
2368 ~wxMBConv_cocoa()
2369 {
2370 }
2371
2372 void Init( CFStringEncoding encoding)
2373 {
2374 m_encoding = encoding ;
2375 }
2376
2377 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2378 {
2379 wxASSERT(szUnConv);
2380
2381 CFStringRef theString = CFStringCreateWithBytes (
2382 NULL, //the allocator
2383 (const UInt8*)szUnConv,
2384 strlen(szUnConv),
2385 m_encoding,
2386 false //no BOM/external representation
2387 );
2388
2389 wxASSERT(theString);
2390
2391 size_t nOutLength = CFStringGetLength(theString);
2392
2393 if (szOut == NULL)
2394 {
2395 CFRelease(theString);
2396 return nOutLength;
2397 }
2398
2399 CFRange theRange = { 0, nOutSize };
2400
2401 #if SIZEOF_WCHAR_T == 4
2402 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2403 #endif
2404
2405 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2406
2407 CFRelease(theString);
2408
2409 szUniCharBuffer[nOutLength] = '\0' ;
2410
2411 #if SIZEOF_WCHAR_T == 4
2412 wxMBConvUTF16 converter ;
2413 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2414 delete[] szUniCharBuffer;
2415 #endif
2416
2417 return nOutLength;
2418 }
2419
2420 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2421 {
2422 wxASSERT(szUnConv);
2423
2424 size_t nRealOutSize;
2425 size_t nBufSize = wxWcslen(szUnConv);
2426 UniChar* szUniBuffer = (UniChar*) szUnConv;
2427
2428 #if SIZEOF_WCHAR_T == 4
2429 wxMBConvUTF16 converter ;
2430 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2431 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2432 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2433 nBufSize /= sizeof(UniChar);
2434 #endif
2435
2436 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2437 NULL, //allocator
2438 szUniBuffer,
2439 nBufSize,
2440 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2441 );
2442
2443 wxASSERT(theString);
2444
2445 //Note that CER puts a BOM when converting to unicode
2446 //so we check and use getchars instead in that case
2447 if (m_encoding == kCFStringEncodingUnicode)
2448 {
2449 if (szOut != NULL)
2450 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2451
2452 nRealOutSize = CFStringGetLength(theString) + 1;
2453 }
2454 else
2455 {
2456 CFStringGetBytes(
2457 theString,
2458 CFRangeMake(0, CFStringGetLength(theString)),
2459 m_encoding,
2460 0, //what to put in characters that can't be converted -
2461 //0 tells CFString to return NULL if it meets such a character
2462 false, //not an external representation
2463 (UInt8*) szOut,
2464 nOutSize,
2465 (CFIndex*) &nRealOutSize
2466 );
2467 }
2468
2469 CFRelease(theString);
2470
2471 #if SIZEOF_WCHAR_T == 4
2472 delete[] szUniBuffer;
2473 #endif
2474
2475 return nRealOutSize - 1;
2476 }
2477
2478 bool IsOk() const
2479 {
2480 return m_encoding != kCFStringEncodingInvalidId &&
2481 CFStringIsEncodingAvailable(m_encoding);
2482 }
2483
2484 private:
2485 CFStringEncoding m_encoding ;
2486 };
2487
2488 #endif // defined(__WXCOCOA__)
2489
2490 // ============================================================================
2491 // Mac conversion classes
2492 // ============================================================================
2493
2494 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2495
2496 class wxMBConv_mac : public wxMBConv
2497 {
2498 public:
2499 wxMBConv_mac()
2500 {
2501 Init(CFStringGetSystemEncoding()) ;
2502 }
2503
2504 #if wxUSE_FONTMAP
2505 wxMBConv_mac(const wxChar* name)
2506 {
2507 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2508 }
2509 #endif
2510
2511 wxMBConv_mac(wxFontEncoding encoding)
2512 {
2513 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2514 }
2515
2516 ~wxMBConv_mac()
2517 {
2518 OSStatus status = noErr ;
2519 status = TECDisposeConverter(m_MB2WC_converter);
2520 status = TECDisposeConverter(m_WC2MB_converter);
2521 }
2522
2523
2524 void Init( TextEncodingBase encoding)
2525 {
2526 OSStatus status = noErr ;
2527 m_char_encoding = encoding ;
2528 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2529
2530 status = TECCreateConverter(&m_MB2WC_converter,
2531 m_char_encoding,
2532 m_unicode_encoding);
2533 status = TECCreateConverter(&m_WC2MB_converter,
2534 m_unicode_encoding,
2535 m_char_encoding);
2536 }
2537
2538 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2539 {
2540 OSStatus status = noErr ;
2541 ByteCount byteOutLen ;
2542 ByteCount byteInLen = strlen(psz) ;
2543 wchar_t *tbuf = NULL ;
2544 UniChar* ubuf = NULL ;
2545 size_t res = 0 ;
2546
2547 if (buf == NULL)
2548 {
2549 //apple specs say at least 32
2550 n = wxMax( 32 , byteInLen ) ;
2551 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2552 }
2553 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2554 #if SIZEOF_WCHAR_T == 4
2555 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2556 #else
2557 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2558 #endif
2559 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2560 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2561 #if SIZEOF_WCHAR_T == 4
2562 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2563 // is not properly terminated we get random characters at the end
2564 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2565 wxMBConvUTF16 converter ;
2566 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2567 free( ubuf ) ;
2568 #else
2569 res = byteOutLen / sizeof( UniChar ) ;
2570 #endif
2571 if ( buf == NULL )
2572 free(tbuf) ;
2573
2574 if ( buf && res < n)
2575 buf[res] = 0;
2576
2577 return res ;
2578 }
2579
2580 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2581 {
2582 OSStatus status = noErr ;
2583 ByteCount byteOutLen ;
2584 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2585
2586 char *tbuf = NULL ;
2587
2588 if (buf == NULL)
2589 {
2590 //apple specs say at least 32
2591 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2592 tbuf = (char*) malloc( n ) ;
2593 }
2594
2595 ByteCount byteBufferLen = n ;
2596 UniChar* ubuf = NULL ;
2597 #if SIZEOF_WCHAR_T == 4
2598 wxMBConvUTF16 converter ;
2599 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2600 byteInLen = unicharlen ;
2601 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2602 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2603 #else
2604 ubuf = (UniChar*) psz ;
2605 #endif
2606 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2607 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2608 #if SIZEOF_WCHAR_T == 4
2609 free( ubuf ) ;
2610 #endif
2611 if ( buf == NULL )
2612 free(tbuf) ;
2613
2614 size_t res = byteOutLen ;
2615 if ( buf && res < n)
2616 {
2617 buf[res] = 0;
2618
2619 //we need to double-trip to verify it didn't insert any ? in place
2620 //of bogus characters
2621 wxWCharBuffer wcBuf(n);
2622 size_t pszlen = wxWcslen(psz);
2623 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2624 wxWcslen(wcBuf) != pszlen ||
2625 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2626 {
2627 // we didn't obtain the same thing we started from, hence
2628 // the conversion was lossy and we consider that it failed
2629 return (size_t)-1;
2630 }
2631 }
2632
2633 return res ;
2634 }
2635
2636 bool IsOk() const
2637 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2638
2639 private:
2640 TECObjectRef m_MB2WC_converter ;
2641 TECObjectRef m_WC2MB_converter ;
2642
2643 TextEncodingBase m_char_encoding ;
2644 TextEncodingBase m_unicode_encoding ;
2645 };
2646
2647 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2648
2649 // ============================================================================
2650 // wxEncodingConverter based conversion classes
2651 // ============================================================================
2652
2653 #if wxUSE_FONTMAP
2654
2655 class wxMBConv_wxwin : public wxMBConv
2656 {
2657 private:
2658 void Init()
2659 {
2660 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2661 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2662 }
2663
2664 public:
2665 // temporarily just use wxEncodingConverter stuff,
2666 // so that it works while a better implementation is built
2667 wxMBConv_wxwin(const wxChar* name)
2668 {
2669 if (name)
2670 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2671 else
2672 m_enc = wxFONTENCODING_SYSTEM;
2673
2674 Init();
2675 }
2676
2677 wxMBConv_wxwin(wxFontEncoding enc)
2678 {
2679 m_enc = enc;
2680
2681 Init();
2682 }
2683
2684 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2685 {
2686 size_t inbuf = strlen(psz);
2687 if (buf)
2688 {
2689 if (!m2w.Convert(psz,buf))
2690 return (size_t)-1;
2691 }
2692 return inbuf;
2693 }
2694
2695 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2696 {
2697 const size_t inbuf = wxWcslen(psz);
2698 if (buf)
2699 {
2700 if (!w2m.Convert(psz,buf))
2701 return (size_t)-1;
2702 }
2703
2704 return inbuf;
2705 }
2706
2707 virtual size_t GetMBNulLen() const
2708 {
2709 switch ( m_enc )
2710 {
2711 case wxFONTENCODING_UTF16BE:
2712 case wxFONTENCODING_UTF16LE:
2713 return 2;
2714
2715 case wxFONTENCODING_UTF32BE:
2716 case wxFONTENCODING_UTF32LE:
2717 return 4;
2718
2719 default:
2720 return 1;
2721 }
2722 }
2723
2724 bool IsOk() const { return m_ok; }
2725
2726 public:
2727 wxFontEncoding m_enc;
2728 wxEncodingConverter m2w, w2m;
2729
2730 private:
2731 // were we initialized successfully?
2732 bool m_ok;
2733
2734 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2735 };
2736
2737 // make the constructors available for unit testing
2738 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2739 {
2740 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2741 if ( !result->IsOk() )
2742 {
2743 delete result;
2744 return 0;
2745 }
2746 return result;
2747 }
2748
2749 #endif // wxUSE_FONTMAP
2750
2751 // ============================================================================
2752 // wxCSConv implementation
2753 // ============================================================================
2754
2755 void wxCSConv::Init()
2756 {
2757 m_name = NULL;
2758 m_convReal = NULL;
2759 m_deferred = true;
2760 }
2761
2762 wxCSConv::wxCSConv(const wxChar *charset)
2763 {
2764 Init();
2765
2766 if ( charset )
2767 {
2768 SetName(charset);
2769 }
2770
2771 #if wxUSE_FONTMAP
2772 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2773 #else
2774 m_encoding = wxFONTENCODING_SYSTEM;
2775 #endif
2776 }
2777
2778 wxCSConv::wxCSConv(wxFontEncoding encoding)
2779 {
2780 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2781 {
2782 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2783
2784 encoding = wxFONTENCODING_SYSTEM;
2785 }
2786
2787 Init();
2788
2789 m_encoding = encoding;
2790 }
2791
2792 wxCSConv::~wxCSConv()
2793 {
2794 Clear();
2795 }
2796
2797 wxCSConv::wxCSConv(const wxCSConv& conv)
2798 : wxMBConv()
2799 {
2800 Init();
2801
2802 SetName(conv.m_name);
2803 m_encoding = conv.m_encoding;
2804 }
2805
2806 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2807 {
2808 Clear();
2809
2810 SetName(conv.m_name);
2811 m_encoding = conv.m_encoding;
2812
2813 return *this;
2814 }
2815
2816 void wxCSConv::Clear()
2817 {
2818 free(m_name);
2819 delete m_convReal;
2820
2821 m_name = NULL;
2822 m_convReal = NULL;
2823 }
2824
2825 void wxCSConv::SetName(const wxChar *charset)
2826 {
2827 if (charset)
2828 {
2829 m_name = wxStrdup(charset);
2830 m_deferred = true;
2831 }
2832 }
2833
// Cache used by the iconv branch of wxCSConv::DoCreate(); it is only
// available when the font mapper is compiled in.
2834 #if wxUSE_FONTMAP
2835 #include "wx/hashmap.h"
2836
// hash map type keyed by wxFontEncoding with wxString values
2837 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2838 wxEncodingNameCache );
2839
// For each encoding already tried, the charset name for which a
// wxMBConv_iconv could be created; an empty string records that none of the
// names worked.
2840 static wxEncodingNameCache gs_nameCache;
2841 #endif
2842
2843 wxMBConv *wxCSConv::DoCreate() const
2844 {
2845 #if wxUSE_FONTMAP
2846 wxLogTrace(TRACE_STRCONV,
2847 wxT("creating conversion for %s"),
2848 (m_name ? m_name
2849 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2850 #endif // wxUSE_FONTMAP
2851
2852 // check for the special case of ASCII or ISO8859-1 charset: as we have
2853 // special knowledge of it anyhow, we don't need to create a special
2854 // conversion object
2855 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2856 m_encoding == wxFONTENCODING_DEFAULT )
2857 {
2858 // don't convert at all
2859 return NULL;
2860 }
2861
2862 // we trust OS to do conversion better than we can so try external
2863 // conversion methods first
2864 //
2865 // the full order is:
2866 // 1. OS conversion (iconv() under Unix or Win32 API)
2867 // 2. hard coded conversions for UTF
2868 // 3. wxEncodingConverter as fall back
2869
2870 // step (1)
2871 #ifdef HAVE_ICONV
2872 #if !wxUSE_FONTMAP
2873 if ( m_name )
2874 #endif // !wxUSE_FONTMAP
2875 {
2876 wxString name(m_name);
2877 wxFontEncoding encoding(m_encoding);
2878
2879 if ( !name.empty() )
2880 {
2881 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2882 if ( conv->IsOk() )
2883 return conv;
2884
2885 delete conv;
2886
2887 #if wxUSE_FONTMAP
2888 encoding =
2889 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2890 #endif // wxUSE_FONTMAP
2891 }
2892 #if wxUSE_FONTMAP
2893 {
2894 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2895 if ( it != gs_nameCache.end() )
2896 {
2897 if ( it->second.empty() )
2898 return NULL;
2899
2900 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2901 if ( conv->IsOk() )
2902 return conv;
2903
2904 delete conv;
2905 }
2906
2907 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2908
2909 for ( ; *names; ++names )
2910 {
2911 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2912 if ( conv->IsOk() )
2913 {
2914 gs_nameCache[encoding] = *names;
2915 return conv;
2916 }
2917
2918 delete conv;
2919 }
2920
2921 gs_nameCache[encoding] = _T(""); // cache the failure
2922 }
2923 #endif // wxUSE_FONTMAP
2924 }
2925 #endif // HAVE_ICONV
2926
2927 #ifdef wxHAVE_WIN32_MB2WC
2928 {
2929 #if wxUSE_FONTMAP
2930 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2931 : new wxMBConv_win32(m_encoding);
2932 if ( conv->IsOk() )
2933 return conv;
2934
2935 delete conv;
2936 #else
2937 return NULL;
2938 #endif
2939 }
2940 #endif // wxHAVE_WIN32_MB2WC
2941 #if defined(__WXMAC__)
2942 {
2943 // leave UTF16 and UTF32 to the built-ins of wx
2944 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2945 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2946 {
2947
2948 #if wxUSE_FONTMAP
2949 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2950 : new wxMBConv_mac(m_encoding);
2951 #else
2952 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2953 #endif
2954 if ( conv->IsOk() )
2955 return conv;
2956
2957 delete conv;
2958 }
2959 }
2960 #endif
2961 #if defined(__WXCOCOA__)
2962 {
2963 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2964 {
2965
2966 #if wxUSE_FONTMAP
2967 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2968 : new wxMBConv_cocoa(m_encoding);
2969 #else
2970 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2971 #endif
2972 if ( conv->IsOk() )
2973 return conv;
2974
2975 delete conv;
2976 }
2977 }
2978 #endif
2979 // step (2)
2980 wxFontEncoding enc = m_encoding;
2981 #if wxUSE_FONTMAP
2982 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2983 {
2984 // use "false" to suppress interactive dialogs -- we can be called from
2985 // anywhere and popping up a dialog from here is the last thing we want to
2986 // do
2987 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2988 }
2989 #endif // wxUSE_FONTMAP
2990
2991 switch ( enc )
2992 {
2993 case wxFONTENCODING_UTF7:
2994 return new wxMBConvUTF7;
2995
2996 case wxFONTENCODING_UTF8:
2997 return new wxMBConvUTF8;
2998
2999 case wxFONTENCODING_UTF16BE:
3000 return new wxMBConvUTF16BE;
3001
3002 case wxFONTENCODING_UTF16LE:
3003 return new wxMBConvUTF16LE;
3004
3005 case wxFONTENCODING_UTF32BE:
3006 return new wxMBConvUTF32BE;
3007
3008 case wxFONTENCODING_UTF32LE:
3009 return new wxMBConvUTF32LE;
3010
3011 default:
3012 // nothing to do but put here to suppress gcc warnings
3013 ;
3014 }
3015
3016 // step (3)
3017 #if wxUSE_FONTMAP
3018 {
3019 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3020 : new wxMBConv_wxwin(m_encoding);
3021 if ( conv->IsOk() )
3022 return conv;
3023
3024 delete conv;
3025 }
3026 #endif // wxUSE_FONTMAP
3027
3028 // NB: This is a hack to prevent deadlock. What could otherwise happen
3029 // in Unicode build: wxConvLocal creation ends up being here
3030 // because of some failure and logs the error. But wxLog will try to
3031 // attach timestamp, for which it will need wxConvLocal (to convert
3032 // time to char* and then wchar_t*), but that fails, tries to log
3033 // error, but wxLog has a (already locked) critical section that
3034 // guards static buffer.
3035 static bool alreadyLoggingError = false;
3036 if (!alreadyLoggingError)
3037 {
3038 alreadyLoggingError = true;
3039 wxLogError(_("Cannot convert from the charset '%s'!"),
3040 m_name ? m_name
3041 :
3042 #if wxUSE_FONTMAP
3043 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3044 #else // !wxUSE_FONTMAP
3045 wxString::Format(_("encoding %s"), m_encoding).c_str()
3046 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3047 );
3048 alreadyLoggingError = false;
3049 }
3050
3051 return NULL;
3052 }
3053
3054 void wxCSConv::CreateConvIfNeeded() const
3055 {
3056 if ( m_deferred )
3057 {
3058 wxCSConv *self = (wxCSConv *)this; // const_cast
3059
3060 #if wxUSE_INTL
3061 // if we don't have neither the name nor the encoding, use the default
3062 // encoding for this system
3063 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3064 {
3065 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3066 }
3067 #endif // wxUSE_INTL
3068
3069 self->m_convReal = DoCreate();
3070 self->m_deferred = false;
3071 }
3072 }
3073
3074 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3075 {
3076 CreateConvIfNeeded();
3077
3078 if (m_convReal)
3079 return m_convReal->MB2WC(buf, psz, n);
3080
3081 // latin-1 (direct)
3082 size_t len = strlen(psz);
3083
3084 if (buf)
3085 {
3086 for (size_t c = 0; c <= len; c++)
3087 buf[c] = (unsigned char)(psz[c]);
3088 }
3089
3090 return len;
3091 }
3092
3093 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3094 {
3095 CreateConvIfNeeded();
3096
3097 if (m_convReal)
3098 return m_convReal->WC2MB(buf, psz, n);
3099
3100 // latin-1 (direct)
3101 const size_t len = wxWcslen(psz);
3102 if (buf)
3103 {
3104 for (size_t c = 0; c <= len; c++)
3105 {
3106 if (psz[c] > 0xFF)
3107 return (size_t)-1;
3108 buf[c] = (char)psz[c];
3109 }
3110 }
3111 else
3112 {
3113 for (size_t c = 0; c <= len; c++)
3114 {
3115 if (psz[c] > 0xFF)
3116 return (size_t)-1;
3117 }
3118 }
3119
3120 return len;
3121 }
3122
3123 size_t wxCSConv::GetMBNulLen() const
3124 {
3125 CreateConvIfNeeded();
3126
3127 if ( m_convReal )
3128 {
3129 return m_convReal->GetMBNulLen();
3130 }
3131
3132 return 1;
3133 }
3134
3135 // ----------------------------------------------------------------------------
3136 // globals
3137 // ----------------------------------------------------------------------------
3138
// The object behind wxConvLibc: its class depends on the platform.
3139 #ifdef __WINDOWS__
3140 static wxMBConv_win32 wxConvLibcObj;
3141 #elif defined(__WXMAC__) && !defined(__MACH__)
3142 static wxMBConv_mac wxConvLibcObj ;
3143 #else
3144 static wxMBConvLibc wxConvLibcObj;
3145 #endif
3146
// File-local objects behind the other exported converters.
3147 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3148 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3149 static wxMBConvUTF7 wxConvUTF7Obj;
3150 static wxMBConvUTF8 wxConvUTF8Obj;
3151
// The exported names are references (or, for wxConvCurrent and
// wxConvFileName, pointers) to the static objects above.
3152 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3153 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3154 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3155 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3156 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3157 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
// wxConvFileName points to the UTF-8 converter when __WXOSX__ is defined
// and to the libc one otherwise.
3158 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3159 #ifdef __WXOSX__
3160 wxConvUTF8Obj;
3161 #else
3162 wxConvLibcObj;
3163 #endif
3164
3165
3166 #else // !wxUSE_WCHAR_T
3167
3168 // stand-ins in absence of wchar_t
// (all four are plain wxMBConv objects here, not references to objects of
// more specific classes as in the wxUSE_WCHAR_T branch)
3169 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3170 wxConvISO8859_1,
3171 wxConvLocal,
3172 wxConvUTF8;
3173
3174 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T