]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
f610bf76e0a0b1116956b92d603fc3c7dc4e2d63
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
25
26 #ifdef __BORLANDC__
27 #pragma hdrstop
28 #endif
29
30 #ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33 #endif // WX_PRECOMP
34
35 #include "wx/strconv.h"
36
37 #if wxUSE_WCHAR_T
38
39 #ifdef __WINDOWS__
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #endif
43
44 #ifndef __WXWINCE__
45 #include <errno.h>
46 #endif
47
48 #include <ctype.h>
49 #include <string.h>
50 #include <stdlib.h>
51
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
55
56 #ifdef __SALFORDC__
57 #include <clib.h>
58 #endif
59
60 #ifdef HAVE_ICONV
61 #include <iconv.h>
62 #include "wx/thread.h"
63 #endif
64
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
67 #include "wx/utils.h"
68
69 #ifdef __WXMAC__
70 #ifndef __DARWIN__
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
74 #endif
75
76 #include "wx/mac/private.h" // includes mac headers
77 #endif
78
79 #define TRACE_STRCONV _T("strconv")
80
81 #if SIZEOF_WCHAR_T == 2
82 #define WC_UTF16
83 #endif
84
85 // ============================================================================
86 // implementation
87 // ============================================================================
88
// helper used by the conversion functions: returns true if at least one of
// the n bytes starting at p is not NUL, false if all of them are NUL (which
// is also the case for n == 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; ++i )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
97
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4
100 // ----------------------------------------------------------------------------
101
102
103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
104 {
105 if (input<=0xffff)
106 {
107 if (output)
108 *output = (wxUint16) input;
109 return 1;
110 }
111 else if (input>=0x110000)
112 {
113 return (size_t)-1;
114 }
115 else
116 {
117 if (output)
118 {
119 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
120 *output = (wxUint16) ((input&0x3ff)+0xdc00);
121 }
122 return 2;
123 }
124 }
125
126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
127 {
128 if ((*input<0xd800) || (*input>0xdfff))
129 {
130 output = *input;
131 return 1;
132 }
133 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
134 {
135 output = *input;
136 return (size_t)-1;
137 }
138 else
139 {
140 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
141 return 2;
142 }
143 }
144
145
146 // ----------------------------------------------------------------------------
147 // wxMBConv
148 // ----------------------------------------------------------------------------
149
150 size_t
151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
152 const char *src, size_t srcLen) const
153 {
154 // although new conversion classes are supposed to implement this function
155 // directly, the existins ones only implement the old MB2WC() and so, to
156 // avoid to have to rewrite all conversion classes at once, we provide a
157 // default (but not efficient) implementation of this one in terms of the
158 // old function by copying the input to ensure that it's NUL-terminated and
159 // then using MB2WC() to convert it
160
161 // the number of chars [which would be] written to dst [if it were not NULL]
162 size_t dstWritten = 0;
163
164 // the number of NULs terminating this string
165 size_t nulLen wxDUMMY_INITIALIZE(0);
166
167 // if we were not given the input size we just have to assume that the
168 // string is properly terminated as we have no way of knowing how long it
169 // is anyhow, but if we do have the size check whether there are enough
170 // NULs at the end
171 wxCharBuffer bufTmp;
172 const char *srcEnd;
173 if ( srcLen != (size_t)-1 )
174 {
175 // we need to know how to find the end of this string
176 nulLen = GetMBNulLen();
177 if ( nulLen == wxCONV_FAILED )
178 return wxCONV_FAILED;
179
180 // if there are enough NULs we can avoid the copy
181 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
182 {
183 // make a copy in order to properly NUL-terminate the string
184 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
185 char * const p = bufTmp.data();
186 memcpy(p, src, srcLen);
187 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
188 *s = '\0';
189
190 src = bufTmp;
191 }
192
193 srcEnd = src + srcLen;
194 }
195 else // quit after the first loop iteration
196 {
197 srcEnd = NULL;
198 }
199
200 for ( ;; )
201 {
202 // try to convert the current chunk
203 size_t lenChunk = MB2WC(NULL, src, 0);
204 if ( lenChunk == 0 )
205 {
206 // nothing left in the input string, conversion succeeded
207 break;
208 }
209
210 if ( lenChunk == wxCONV_FAILED )
211 return wxCONV_FAILED;
212
213 // if we already have a previous chunk, leave the NUL separating it
214 // from this one
215 if ( dstWritten )
216 {
217 dstWritten++;
218 if ( dst )
219 dst++;
220 }
221
222 dstWritten += lenChunk;
223
224 if ( dst )
225 {
226 if ( dstWritten > dstLen )
227 return wxCONV_FAILED;
228
229 lenChunk = MB2WC(dst, src, lenChunk + 1 /* for NUL */);
230 if ( lenChunk == wxCONV_FAILED )
231 return wxCONV_FAILED;
232
233 dst += lenChunk;
234 }
235
236 if ( !srcEnd )
237 {
238 // we convert the entire string in this cas, as we suppose that the
239 // string is NUL-terminated and so srcEnd is not used at all
240 break;
241 }
242
243 // advance the input pointer past the end of this chunk
244 while ( NotAllNULs(src, nulLen) )
245 {
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
250 src += nulLen;
251 }
252
253 src += nulLen; // skipping over its terminator as well
254
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by inEnd
258 if ( src >= srcEnd )
259 break;
260 }
261
262 return dstWritten;
263 }
264
265 size_t
266 wxMBConv::FromWChar(char *dst, size_t dstLen,
267 const wchar_t *src, size_t srcLen) const
268 {
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten = 0;
271
272 // make a copy of the input string unless it is already properly
273 // NUL-terminated
274 //
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp;
278 if ( srcLen == (size_t)-1 )
279 {
280 srcLen = wxWcslen(src) + 1;
281 }
282 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
283 {
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp = wxWCharBuffer(srcLen);
286 memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
287 src = bufTmp;
288 }
289
290 const size_t lenNul = GetMBNulLen();
291 for ( const wchar_t * const srcEnd = src + srcLen;
292 src < srcEnd;
293 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
294 {
295 // try to convert the current chunk
296 size_t lenChunk = WC2MB(NULL, src, 0);
297
298 if ( lenChunk == wxCONV_FAILED )
299 return wxCONV_FAILED;
300
301 lenChunk += lenNul;
302 dstWritten += lenChunk;
303
304 if ( dst )
305 {
306 if ( dstWritten > dstLen )
307 return wxCONV_FAILED;
308
309 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
310 return wxCONV_FAILED;
311
312 dst += lenChunk;
313 }
314 }
315
316 return dstWritten;
317 }
318
319 wxMBConv::~wxMBConv()
320 {
321 // nothing to do here (necessary for Darwin linking probably)
322 }
323
324 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
325 {
326 if ( psz )
327 {
328 // calculate the length of the buffer needed first
329 const size_t nLen = MB2WC(NULL, psz, 0);
330 if ( nLen != wxCONV_FAILED )
331 {
332 // now do the actual conversion
333 wxWCharBuffer buf(nLen /* +1 added implicitly */);
334
335 // +1 for the trailing NULL
336 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
337 return buf;
338 }
339 }
340
341 return wxWCharBuffer();
342 }
343
344 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
345 {
346 if ( pwz )
347 {
348 const size_t nLen = WC2MB(NULL, pwz, 0);
349 if ( nLen != wxCONV_FAILED )
350 {
351 // extra space for trailing NUL(s)
352 static const size_t extraLen = GetMaxMBNulLen();
353
354 wxCharBuffer buf(nLen + extraLen - 1);
355 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
356 return buf;
357 }
358 }
359
360 return wxCharBuffer();
361 }
362
363 const wxWCharBuffer
364 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
365 {
366 const size_t dstLen = ToWChar(NULL, 0, in, inLen);
367 if ( dstLen != wxCONV_FAILED )
368 {
369 wxWCharBuffer wbuf(dstLen);
370 if ( ToWChar(wbuf.data(), dstLen, in, inLen) )
371 {
372 if ( outLen )
373 *outLen = dstLen;
374 return wbuf;
375 }
376 }
377
378 if ( outLen )
379 *outLen = 0;
380
381 return wxWCharBuffer();
382 }
383
384 const wxCharBuffer
385 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
386 {
387 const size_t dstLen = FromWChar(NULL, 0, in, inLen);
388 if ( dstLen != wxCONV_FAILED )
389 {
390 wxCharBuffer buf(dstLen);
391 if ( FromWChar(buf.data(), dstLen, in, inLen) )
392 {
393 if ( outLen )
394 *outLen = dstLen;
395 return buf;
396 }
397 }
398
399 if ( outLen )
400 *outLen = 0;
401
402 return wxCharBuffer();
403 }
404
405 // ----------------------------------------------------------------------------
406 // wxMBConvLibc
407 // ----------------------------------------------------------------------------
408
409 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
410 {
411 return wxMB2WC(buf, psz, n);
412 }
413
414 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
415 {
416 return wxWC2MB(buf, psz, n);
417 }
418
419 // ----------------------------------------------------------------------------
420 // wxConvBrokenFileNames
421 // ----------------------------------------------------------------------------
422
423 #ifdef __UNIX__
424
425 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
426 {
427 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
428 || wxStricmp(charset, _T("UTF8")) == 0 )
429 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
430 else
431 m_conv = new wxCSConv(charset);
432 }
433
434 #endif // __UNIX__
435
436 // ----------------------------------------------------------------------------
437 // UTF-7
438 // ----------------------------------------------------------------------------
439
440 // Implementation (C) 2004 Fredrik Roubert
441
442 //
443 // BASE64 decoding table
444 //
// indexed by the byte value: gives the 6 bit value encoded by this BASE64
// character or 0xff if the byte is not a BASE64 character at all
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' - 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' - 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' - 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' - 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: no BASE64 characters here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
480
481 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
482 {
483 size_t len = 0;
484
485 while ( *psz && (!buf || (len < n)) )
486 {
487 unsigned char cc = *psz++;
488 if (cc != '+')
489 {
490 // plain ASCII char
491 if (buf)
492 *buf++ = cc;
493 len++;
494 }
495 else if (*psz == '-')
496 {
497 // encoded plus sign
498 if (buf)
499 *buf++ = cc;
500 len++;
501 psz++;
502 }
503 else // start of BASE64 encoded string
504 {
505 bool lsb, ok;
506 unsigned int d, l;
507 for ( ok = lsb = false, d = 0, l = 0;
508 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
509 psz++ )
510 {
511 d <<= 6;
512 d += cc;
513 for (l += 6; l >= 8; lsb = !lsb)
514 {
515 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
516 if (lsb)
517 {
518 if (buf)
519 *buf++ |= c;
520 len ++;
521 }
522 else
523 {
524 if (buf)
525 *buf = (wchar_t)(c << 8);
526 }
527
528 ok = true;
529 }
530 }
531
532 if ( !ok )
533 {
534 // in valid UTF7 we should have valid characters after '+'
535 return (size_t)-1;
536 }
537
538 if (*psz == '-')
539 psz++;
540 }
541 }
542
543 if ( buf && (len < n) )
544 *buf = '\0';
545
546 return len;
547 }
548
549 //
550 // BASE64 encoding table
551 //
// indexed by a 6 bit value: gives the BASE64 character encoding it
static const unsigned char utf7enb64[] =
{
    // 0 - 15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 16 - 31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 32 - 47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 48 - 63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
563
564 //
565 // UTF-7 encoding table
566 //
567 // 0 - Set D (directly encoded characters)
568 // 1 - Set O (optional direct characters)
569 // 2 - whitespace characters (optional)
570 // 3 - special characters
571 //
// indexed by the ASCII code: gives the class of the character (0 to 3) when
// encoding to UTF-7
static const unsigned char utf7encode[128] =
{
    // 0x00 - 0x0f: control characters, TAB, LF and CR are whitespace
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    // 0x10 - 0x1f: control characters
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 0x20 - 0x2f: space and punctuation
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    // 0x30 - 0x3f: digits and punctuation
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    // 0x40 - 0x4f: '@' and 'A' - 'O'
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x50 - 0x5f: 'P' - 'Z' and punctuation
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    // 0x60 - 0x6f: '`' and 'a' - 'o'
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x70 - 0x7f: 'p' - 'z', punctuation and DEL
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
583
584 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
585 {
586 size_t len = 0;
587
588 while (*psz && ((!buf) || (len < n)))
589 {
590 wchar_t cc = *psz++;
591 if (cc < 0x80 && utf7encode[cc] < 1)
592 {
593 // plain ASCII char
594 if (buf)
595 *buf++ = (char)cc;
596 len++;
597 }
598 #ifndef WC_UTF16
599 else if (((wxUint32)cc) > 0xffff)
600 {
601 // no surrogate pair generation (yet?)
602 return (size_t)-1;
603 }
604 #endif
605 else
606 {
607 if (buf)
608 *buf++ = '+';
609 len++;
610 if (cc != '+')
611 {
612 // BASE64 encode string
613 unsigned int lsb, d, l;
614 for (d = 0, l = 0; /*nothing*/; psz++)
615 {
616 for (lsb = 0; lsb < 2; lsb ++)
617 {
618 d <<= 8;
619 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
620
621 for (l += 8; l >= 6; )
622 {
623 l -= 6;
624 if (buf)
625 *buf++ = utf7enb64[(d >> l) % 64];
626 len++;
627 }
628 }
629 cc = *psz;
630 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
631 break;
632 }
633 if (l != 0)
634 {
635 if (buf)
636 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
637 len++;
638 }
639 }
640 if (buf)
641 *buf++ = '-';
642 len++;
643 }
644 }
645 if (buf && (len < n))
646 *buf = 0;
647 return len;
648 }
649
650 // ----------------------------------------------------------------------------
651 // UTF-8
652 // ----------------------------------------------------------------------------
653
654 static wxUint32 utf8_max[]=
655 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
656
657 // boundaries of the private use area we use to (temporarily) remap invalid
658 // characters invalid in a UTF-8 encoded string
659 const wxUint32 wxUnicodePUA = 0x100000;
660 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
661
662 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
663 {
664 size_t len = 0;
665
666 while (*psz && ((!buf) || (len < n)))
667 {
668 const char *opsz = psz;
669 bool invalid = false;
670 unsigned char cc = *psz++, fc = cc;
671 unsigned cnt;
672 for (cnt = 0; fc & 0x80; cnt++)
673 fc <<= 1;
674 if (!cnt)
675 {
676 // plain ASCII char
677 if (buf)
678 *buf++ = cc;
679 len++;
680
681 // escape the escape character for octal escapes
682 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
683 && cc == '\\' && (!buf || len < n))
684 {
685 if (buf)
686 *buf++ = cc;
687 len++;
688 }
689 }
690 else
691 {
692 cnt--;
693 if (!cnt)
694 {
695 // invalid UTF-8 sequence
696 invalid = true;
697 }
698 else
699 {
700 unsigned ocnt = cnt - 1;
701 wxUint32 res = cc & (0x3f >> cnt);
702 while (cnt--)
703 {
704 cc = *psz;
705 if ((cc & 0xC0) != 0x80)
706 {
707 // invalid UTF-8 sequence
708 invalid = true;
709 break;
710 }
711 psz++;
712 res = (res << 6) | (cc & 0x3f);
713 }
714 if (invalid || res <= utf8_max[ocnt])
715 {
716 // illegal UTF-8 encoding
717 invalid = true;
718 }
719 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
720 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
721 {
722 // if one of our PUA characters turns up externally
723 // it must also be treated as an illegal sequence
724 // (a bit like you have to escape an escape character)
725 invalid = true;
726 }
727 else
728 {
729 #ifdef WC_UTF16
730 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
731 size_t pa = encode_utf16(res, (wxUint16 *)buf);
732 if (pa == (size_t)-1)
733 {
734 invalid = true;
735 }
736 else
737 {
738 if (buf)
739 buf += pa;
740 len += pa;
741 }
742 #else // !WC_UTF16
743 if (buf)
744 *buf++ = (wchar_t)res;
745 len++;
746 #endif // WC_UTF16/!WC_UTF16
747 }
748 }
749 if (invalid)
750 {
751 if (m_options & MAP_INVALID_UTF8_TO_PUA)
752 {
753 while (opsz < psz && (!buf || len < n))
754 {
755 #ifdef WC_UTF16
756 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
757 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
758 wxASSERT(pa != (size_t)-1);
759 if (buf)
760 buf += pa;
761 opsz++;
762 len += pa;
763 #else
764 if (buf)
765 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
766 opsz++;
767 len++;
768 #endif
769 }
770 }
771 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
772 {
773 while (opsz < psz && (!buf || len < n))
774 {
775 if ( buf && len + 3 < n )
776 {
777 unsigned char on = *opsz;
778 *buf++ = L'\\';
779 *buf++ = (wchar_t)( L'0' + on / 0100 );
780 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
781 *buf++ = (wchar_t)( L'0' + on % 010 );
782 }
783 opsz++;
784 len += 4;
785 }
786 }
787 else // MAP_INVALID_UTF8_NOT
788 {
789 return (size_t)-1;
790 }
791 }
792 }
793 }
794 if (buf && (len < n))
795 *buf = 0;
796 return len;
797 }
798
// return true if wch is one of the octal digits L'0' to L'7'
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
803
804 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
805 {
806 size_t len = 0;
807
808 while (*psz && ((!buf) || (len < n)))
809 {
810 wxUint32 cc;
811 #ifdef WC_UTF16
812 // cast is ok for WC_UTF16
813 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
814 psz += (pa == (size_t)-1) ? 1 : pa;
815 #else
816 cc=(*psz++) & 0x7fffffff;
817 #endif
818
819 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
820 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
821 {
822 if (buf)
823 *buf++ = (char)(cc - wxUnicodePUA);
824 len++;
825 }
826 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
827 && cc == L'\\' && psz[0] == L'\\' )
828 {
829 if (buf)
830 *buf++ = (char)cc;
831 psz++;
832 len++;
833 }
834 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
835 cc == L'\\' &&
836 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
837 {
838 if (buf)
839 {
840 *buf++ = (char) ((psz[0] - L'0')*0100 +
841 (psz[1] - L'0')*010 +
842 (psz[2] - L'0'));
843 }
844
845 psz += 3;
846 len++;
847 }
848 else
849 {
850 unsigned cnt;
851 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
852 if (!cnt)
853 {
854 // plain ASCII char
855 if (buf)
856 *buf++ = (char) cc;
857 len++;
858 }
859
860 else
861 {
862 len += cnt + 1;
863 if (buf)
864 {
865 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
866 while (cnt--)
867 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
868 }
869 }
870 }
871 }
872
873 if (buf && (len<n))
874 *buf = 0;
875
876 return len;
877 }
878
879 // ----------------------------------------------------------------------------
880 // UTF-16
881 // ----------------------------------------------------------------------------
882
// "straight" is the UTF-16 conversion class for which the bytes are copied
// in order and "swap" the one for which they must be exchanged
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
890
891
892 #ifdef WC_UTF16
893
894 // copy 16bit MB to 16bit String
895 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
896 {
897 size_t len=0;
898
899 while (*(wxUint16*)psz && (!buf || len < n))
900 {
901 if (buf)
902 *buf++ = *(wxUint16*)psz;
903 len++;
904
905 psz += sizeof(wxUint16);
906 }
907 if (buf && len<n) *buf=0;
908
909 return len;
910 }
911
912
913 // copy 16bit String to 16bit MB
914 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
915 {
916 size_t len=0;
917
918 while (*psz && (!buf || len < n))
919 {
920 if (buf)
921 {
922 *(wxUint16*)buf = *psz;
923 buf += sizeof(wxUint16);
924 }
925 len += sizeof(wxUint16);
926 psz++;
927 }
928 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
929
930 return len;
931 }
932
933
934 // swap 16bit MB to 16bit String
935 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
936 {
937 size_t len = 0;
938
939 // UTF16 string must be terminated by 2 NULs as single NULs may occur
940 // inside the string
941 while ( (psz[0] || psz[1]) && (!buf || len < n) )
942 {
943 if ( buf )
944 {
945 ((char *)buf)[0] = psz[1];
946 ((char *)buf)[1] = psz[0];
947 buf++;
948 }
949 len++;
950 psz += 2;
951 }
952
953 if ( buf && len < n )
954 *buf = L'\0';
955
956 return len;
957 }
958
959
960 // swap 16bit MB to 16bit String
961 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
962 {
963 size_t len = 0;
964
965 while ( *psz && (!buf || len < n) )
966 {
967 if ( buf )
968 {
969 *buf++ = ((char*)psz)[1];
970 *buf++ = ((char*)psz)[0];
971 }
972 len += 2;
973 psz++;
974 }
975
976 if ( buf && len < n )
977 *buf = '\0';
978
979 return len;
980 }
981
982
983 #else // WC_UTF16
984
985
986 // copy 16bit MB to 32bit String
987 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
988 {
989 size_t len=0;
990
991 while (*(wxUint16*)psz && (!buf || len < n))
992 {
993 wxUint32 cc;
994 size_t pa=decode_utf16((wxUint16*)psz, cc);
995 if (pa == (size_t)-1)
996 return pa;
997
998 if (buf)
999 *buf++ = (wchar_t)cc;
1000 len++;
1001 psz += pa * sizeof(wxUint16);
1002 }
1003 if (buf && len<n) *buf=0;
1004
1005 return len;
1006 }
1007
1008
1009 // copy 32bit String to 16bit MB
1010 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1011 {
1012 size_t len=0;
1013
1014 while (*psz && (!buf || len < n))
1015 {
1016 wxUint16 cc[2];
1017 size_t pa=encode_utf16(*psz, cc);
1018
1019 if (pa == (size_t)-1)
1020 return pa;
1021
1022 if (buf)
1023 {
1024 *(wxUint16*)buf = cc[0];
1025 buf += sizeof(wxUint16);
1026 if (pa > 1)
1027 {
1028 *(wxUint16*)buf = cc[1];
1029 buf += sizeof(wxUint16);
1030 }
1031 }
1032
1033 len += pa*sizeof(wxUint16);
1034 psz++;
1035 }
1036 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1037
1038 return len;
1039 }
1040
1041
1042 // swap 16bit MB to 32bit String
1043 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1044 {
1045 size_t len=0;
1046
1047 while (*(wxUint16*)psz && (!buf || len < n))
1048 {
1049 wxUint32 cc;
1050 char tmp[4];
1051 tmp[0]=psz[1]; tmp[1]=psz[0];
1052 tmp[2]=psz[3]; tmp[3]=psz[2];
1053
1054 size_t pa=decode_utf16((wxUint16*)tmp, cc);
1055 if (pa == (size_t)-1)
1056 return pa;
1057
1058 if (buf)
1059 *buf++ = (wchar_t)cc;
1060
1061 len++;
1062 psz += pa * sizeof(wxUint16);
1063 }
1064 if (buf && len<n) *buf=0;
1065
1066 return len;
1067 }
1068
1069
1070 // swap 32bit String to 16bit MB
1071 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1072 {
1073 size_t len=0;
1074
1075 while (*psz && (!buf || len < n))
1076 {
1077 wxUint16 cc[2];
1078 size_t pa=encode_utf16(*psz, cc);
1079
1080 if (pa == (size_t)-1)
1081 return pa;
1082
1083 if (buf)
1084 {
1085 *buf++ = ((char*)cc)[1];
1086 *buf++ = ((char*)cc)[0];
1087 if (pa > 1)
1088 {
1089 *buf++ = ((char*)cc)[3];
1090 *buf++ = ((char*)cc)[2];
1091 }
1092 }
1093
1094 len += pa*sizeof(wxUint16);
1095 psz++;
1096 }
1097 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1098
1099 return len;
1100 }
1101
1102 #endif // WC_UTF16
1103
1104
1105 // ----------------------------------------------------------------------------
1106 // UTF-32
1107 // ----------------------------------------------------------------------------
1108
1109 #ifdef WORDS_BIGENDIAN
1110 #define wxMBConvUTF32straight wxMBConvUTF32BE
1111 #define wxMBConvUTF32swap wxMBConvUTF32LE
1112 #else
1113 #define wxMBConvUTF32swap wxMBConvUTF32BE
1114 #define wxMBConvUTF32straight wxMBConvUTF32LE
1115 #endif
1116
1117
1118 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1119 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1120
1121
1122 #ifdef WC_UTF16
1123
1124 // copy 32bit MB to 16bit String
1125 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1126 {
1127 size_t len=0;
1128
1129 while (*(wxUint32*)psz && (!buf || len < n))
1130 {
1131 wxUint16 cc[2];
1132
1133 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1134 if (pa == (size_t)-1)
1135 return pa;
1136
1137 if (buf)
1138 {
1139 *buf++ = cc[0];
1140 if (pa > 1)
1141 *buf++ = cc[1];
1142 }
1143 len += pa;
1144 psz += sizeof(wxUint32);
1145 }
1146 if (buf && len<n) *buf=0;
1147
1148 return len;
1149 }
1150
1151
1152 // copy 16bit String to 32bit MB
1153 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1154 {
1155 size_t len=0;
1156
1157 while (*psz && (!buf || len < n))
1158 {
1159 wxUint32 cc;
1160
1161 // cast is ok for WC_UTF16
1162 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1163 if (pa == (size_t)-1)
1164 return pa;
1165
1166 if (buf)
1167 {
1168 *(wxUint32*)buf = cc;
1169 buf += sizeof(wxUint32);
1170 }
1171 len += sizeof(wxUint32);
1172 psz += pa;
1173 }
1174
1175 if (buf && len<=n-sizeof(wxUint32))
1176 *(wxUint32*)buf=0;
1177
1178 return len;
1179 }
1180
1181
1182
1183 // swap 32bit MB to 16bit String
1184 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1185 {
1186 size_t len=0;
1187
1188 while (*(wxUint32*)psz && (!buf || len < n))
1189 {
1190 char tmp[4];
1191 tmp[0] = psz[3]; tmp[1] = psz[2];
1192 tmp[2] = psz[1]; tmp[3] = psz[0];
1193
1194
1195 wxUint16 cc[2];
1196
1197 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1198 if (pa == (size_t)-1)
1199 return pa;
1200
1201 if (buf)
1202 {
1203 *buf++ = cc[0];
1204 if (pa > 1)
1205 *buf++ = cc[1];
1206 }
1207 len += pa;
1208 psz += sizeof(wxUint32);
1209 }
1210
1211 if (buf && len<n)
1212 *buf=0;
1213
1214 return len;
1215 }
1216
1217
1218 // swap 16bit String to 32bit MB
1219 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1220 {
1221 size_t len=0;
1222
1223 while (*psz && (!buf || len < n))
1224 {
1225 char cc[4];
1226
1227 // cast is ok for WC_UTF16
1228 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1229 if (pa == (size_t)-1)
1230 return pa;
1231
1232 if (buf)
1233 {
1234 *buf++ = cc[3];
1235 *buf++ = cc[2];
1236 *buf++ = cc[1];
1237 *buf++ = cc[0];
1238 }
1239 len += sizeof(wxUint32);
1240 psz += pa;
1241 }
1242
1243 if (buf && len<=n-sizeof(wxUint32))
1244 *(wxUint32*)buf=0;
1245
1246 return len;
1247 }
1248
1249 #else // WC_UTF16
1250
1251
1252 // copy 32bit MB to 32bit String
1253 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1254 {
1255 size_t len=0;
1256
1257 while (*(wxUint32*)psz && (!buf || len < n))
1258 {
1259 if (buf)
1260 *buf++ = (wchar_t)(*(wxUint32*)psz);
1261 len++;
1262 psz += sizeof(wxUint32);
1263 }
1264
1265 if (buf && len<n)
1266 *buf=0;
1267
1268 return len;
1269 }
1270
1271
1272 // copy 32bit String to 32bit MB
1273 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1274 {
1275 size_t len=0;
1276
1277 while (*psz && (!buf || len < n))
1278 {
1279 if (buf)
1280 {
1281 *(wxUint32*)buf = *psz;
1282 buf += sizeof(wxUint32);
1283 }
1284
1285 len += sizeof(wxUint32);
1286 psz++;
1287 }
1288
1289 if (buf && len<=n-sizeof(wxUint32))
1290 *(wxUint32*)buf=0;
1291
1292 return len;
1293 }
1294
1295
1296 // swap 32bit MB to 32bit String
1297 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1298 {
1299 size_t len=0;
1300
1301 while (*(wxUint32*)psz && (!buf || len < n))
1302 {
1303 if (buf)
1304 {
1305 ((char *)buf)[0] = psz[3];
1306 ((char *)buf)[1] = psz[2];
1307 ((char *)buf)[2] = psz[1];
1308 ((char *)buf)[3] = psz[0];
1309 buf++;
1310 }
1311 len++;
1312 psz += sizeof(wxUint32);
1313 }
1314
1315 if (buf && len<n)
1316 *buf=0;
1317
1318 return len;
1319 }
1320
1321
1322 // swap 32bit String to 32bit MB
1323 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1324 {
1325 size_t len=0;
1326
1327 while (*psz && (!buf || len < n))
1328 {
1329 if (buf)
1330 {
1331 *buf++ = ((char *)psz)[3];
1332 *buf++ = ((char *)psz)[2];
1333 *buf++ = ((char *)psz)[1];
1334 *buf++ = ((char *)psz)[0];
1335 }
1336 len += sizeof(wxUint32);
1337 psz++;
1338 }
1339
1340 if (buf && len<=n-sizeof(wxUint32))
1341 *(wxUint32*)buf=0;
1342
1343 return len;
1344 }
1345
1346
1347 #endif // WC_UTF16
1348
1349
1350 // ============================================================================
1351 // The classes doing conversion using the iconv_xxx() functions
1352 // ============================================================================
1353
1354 #ifdef HAVE_ICONV
1355
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft): true if an iconv() call failed, where cres is
// its return value and bufLeft the number of input bytes it left unconverted.
// The arguments are parenthesized so that arbitrary expressions can be passed.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type expected by iconv(),
// which takes a pointer to const data on some systems only
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value used for an iconv_t which doesn't refer to an opened converter
#define ICONV_T_INVALID ((iconv_t)-1)
1374
1375 #if SIZEOF_WCHAR_T == 4
1376 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1377 #define WC_ENC wxFONTENCODING_UTF32
1378 #elif SIZEOF_WCHAR_T == 2
1379 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1380 #define WC_ENC wxFONTENCODING_UTF16
1381 #else // sizeof(wchar_t) != 2 nor 4
1382 // does this ever happen?
1383 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1384 #endif
1385
1386 // ----------------------------------------------------------------------------
1387 // wxMBConv_iconv: encapsulates an iconv character set
1388 // ----------------------------------------------------------------------------
1389
1390 class wxMBConv_iconv : public wxMBConv
1391 {
1392 public:
1393 wxMBConv_iconv(const wxChar *name);
1394 virtual ~wxMBConv_iconv();
1395
1396 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1397 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1398
1399 // classify this encoding as explained in wxMBConv::GetMBNulLen()
1400 // comment
1401 virtual size_t GetMBNulLen() const;
1402
1403 bool IsOk() const
1404 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1405
1406 protected:
1407 // the iconv handlers used to translate from multibyte to wide char and in
1408 // the other direction
1409 iconv_t m2w,
1410 w2m;
1411 #if wxUSE_THREADS
1412 // guards access to m2w and w2m objects
1413 wxMutex m_iconvMutex;
1414 #endif
1415
1416 private:
1417 // the name (for iconv_open()) of a wide char charset -- if none is
1418 // available on this machine, it will remain NULL
1419 static wxString ms_wcCharsetName;
1420
1421 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1422 // different endian-ness than the native one
1423 static bool ms_wcNeedsSwap;
1424
1425 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1426 // initially
1427 size_t m_minMBCharWidth;
1428 };
1429
1430 // make the constructor available for unit testing
1431 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1432 {
1433 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1434 if ( !result->IsOk() )
1435 {
1436 delete result;
1437 return 0;
1438 }
1439 return result;
1440 }
1441
1442 wxString wxMBConv_iconv::ms_wcCharsetName;
1443 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1444
1445 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1446 {
1447 m_minMBCharWidth = 0;
1448
1449 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1450 // names for the charsets
1451 const wxCharBuffer cname(wxString(name).ToAscii());
1452
1453 // check for charset that represents wchar_t:
1454 if ( ms_wcCharsetName.empty() )
1455 {
1456 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1457
1458 #if wxUSE_FONTMAP
1459 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1460 #else // !wxUSE_FONTMAP
1461 static const wxChar *names[] =
1462 {
1463 #if SIZEOF_WCHAR_T == 4
1464 _T("UCS-4"),
1465 #elif SIZEOF_WCHAR_T = 2
1466 _T("UCS-2"),
1467 #endif
1468 NULL
1469 };
1470 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1471
1472 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1473 {
1474 const wxString nameCS(*names);
1475
1476 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1477 wxString nameXE(nameCS);
1478 #ifdef WORDS_BIGENDIAN
1479 nameXE += _T("BE");
1480 #else // little endian
1481 nameXE += _T("LE");
1482 #endif
1483
1484 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1485 nameXE.c_str());
1486
1487 m2w = iconv_open(nameXE.ToAscii(), cname);
1488 if ( m2w == ICONV_T_INVALID )
1489 {
1490 // try charset w/o bytesex info (e.g. "UCS4")
1491 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1492 nameCS.c_str());
1493 m2w = iconv_open(nameCS.ToAscii(), cname);
1494
1495 // and check for bytesex ourselves:
1496 if ( m2w != ICONV_T_INVALID )
1497 {
1498 char buf[2], *bufPtr;
1499 wchar_t wbuf[2], *wbufPtr;
1500 size_t insz, outsz;
1501 size_t res;
1502
1503 buf[0] = 'A';
1504 buf[1] = 0;
1505 wbuf[0] = 0;
1506 insz = 2;
1507 outsz = SIZEOF_WCHAR_T * 2;
1508 wbufPtr = wbuf;
1509 bufPtr = buf;
1510
1511 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1512 (char**)&wbufPtr, &outsz);
1513
1514 if (ICONV_FAILED(res, insz))
1515 {
1516 wxLogLastError(wxT("iconv"));
1517 wxLogError(_("Conversion to charset '%s' doesn't work."),
1518 nameCS.c_str());
1519 }
1520 else // ok, can convert to this encoding, remember it
1521 {
1522 ms_wcCharsetName = nameCS;
1523 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1524 }
1525 }
1526 }
1527 else // use charset not requiring byte swapping
1528 {
1529 ms_wcCharsetName = nameXE;
1530 }
1531 }
1532
1533 wxLogTrace(TRACE_STRCONV,
1534 wxT("iconv wchar_t charset is \"%s\"%s"),
1535 ms_wcCharsetName.empty() ? _T("<none>")
1536 : ms_wcCharsetName.c_str(),
1537 ms_wcNeedsSwap ? _T(" (needs swap)")
1538 : _T(""));
1539 }
1540 else // we already have ms_wcCharsetName
1541 {
1542 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1543 }
1544
1545 if ( ms_wcCharsetName.empty() )
1546 {
1547 w2m = ICONV_T_INVALID;
1548 }
1549 else
1550 {
1551 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1552 if ( w2m == ICONV_T_INVALID )
1553 {
1554 wxLogTrace(TRACE_STRCONV,
1555 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1556 ms_wcCharsetName.c_str(), cname.data());
1557 }
1558 }
1559 }
1560
1561 wxMBConv_iconv::~wxMBConv_iconv()
1562 {
1563 if ( m2w != ICONV_T_INVALID )
1564 iconv_close(m2w);
1565 if ( w2m != ICONV_T_INVALID )
1566 iconv_close(w2m);
1567 }
1568
1569 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1570 {
1571 // find the string length: notice that must be done differently for
1572 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1573 size_t inbuf;
1574 const size_t nulLen = GetMBNulLen();
1575 switch ( nulLen )
1576 {
1577 default:
1578 return (size_t)-1;
1579
1580 case 1:
1581 inbuf = strlen(psz); // arguably more optimized than our version
1582 break;
1583
1584 case 2:
1585 case 4:
1586 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1587 // they also have to start at character boundary and not span two
1588 // adjacent characters
1589 const char *p;
1590 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1591 ;
1592 inbuf = p - psz;
1593 break;
1594 }
1595
1596 #if wxUSE_THREADS
1597 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1598 // Unfortunately there is a couple of global wxCSConv objects such as
1599 // wxConvLocal that are used all over wx code, so we have to make sure
1600 // the handle is used by at most one thread at the time. Otherwise
1601 // only a few wx classes would be safe to use from non-main threads
1602 // as MB<->WC conversion would fail "randomly".
1603 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1604 #endif // wxUSE_THREADS
1605
1606
1607 size_t outbuf = n * SIZEOF_WCHAR_T;
1608 size_t res, cres;
1609 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1610 wchar_t *bufPtr = buf;
1611 const char *pszPtr = psz;
1612
1613 if (buf)
1614 {
1615 // have destination buffer, convert there
1616 cres = iconv(m2w,
1617 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1618 (char**)&bufPtr, &outbuf);
1619 res = n - (outbuf / SIZEOF_WCHAR_T);
1620
1621 if (ms_wcNeedsSwap)
1622 {
1623 // convert to native endianness
1624 for ( unsigned i = 0; i < res; i++ )
1625 buf[n] = WC_BSWAP(buf[i]);
1626 }
1627
1628 // NUL-terminate the string if there is any space left
1629 if (res < n)
1630 buf[res] = 0;
1631 }
1632 else
1633 {
1634 // no destination buffer... convert using temp buffer
1635 // to calculate destination buffer requirement
1636 wchar_t tbuf[8];
1637 res = 0;
1638 do {
1639 bufPtr = tbuf;
1640 outbuf = 8*SIZEOF_WCHAR_T;
1641
1642 cres = iconv(m2w,
1643 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1644 (char**)&bufPtr, &outbuf );
1645
1646 res += 8-(outbuf/SIZEOF_WCHAR_T);
1647 } while ((cres==(size_t)-1) && (errno==E2BIG));
1648 }
1649
1650 if (ICONV_FAILED(cres, inbuf))
1651 {
1652 //VS: it is ok if iconv fails, hence trace only
1653 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1654 return (size_t)-1;
1655 }
1656
1657 return res;
1658 }
1659
1660 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1661 {
1662 #if wxUSE_THREADS
1663 // NB: explained in MB2WC
1664 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1665 #endif
1666
1667 size_t inlen = wxWcslen(psz);
1668 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1669 size_t outbuf = n;
1670 size_t res, cres;
1671
1672 wchar_t *tmpbuf = 0;
1673
1674 if (ms_wcNeedsSwap)
1675 {
1676 // need to copy to temp buffer to switch endianness
1677 // (doing WC_BSWAP twice on the original buffer won't help, as it
1678 // could be in read-only memory, or be accessed in some other thread)
1679 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1680 for ( size_t i = 0; i < inlen; i++ )
1681 tmpbuf[n] = WC_BSWAP(psz[i]);
1682 tmpbuf[inlen] = L'\0';
1683 psz = tmpbuf;
1684 }
1685
1686 if (buf)
1687 {
1688 // have destination buffer, convert there
1689 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1690
1691 res = n-outbuf;
1692
1693 // NB: iconv was given only wcslen(psz) characters on input, and so
1694 // it couldn't convert the trailing zero. Let's do it ourselves
1695 // if there's some room left for it in the output buffer.
1696 if (res < n)
1697 buf[0] = 0;
1698 }
1699 else
1700 {
1701 // no destination buffer... convert using temp buffer
1702 // to calculate destination buffer requirement
1703 char tbuf[16];
1704 res = 0;
1705 do {
1706 buf = tbuf; outbuf = 16;
1707
1708 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1709
1710 res += 16 - outbuf;
1711 } while ((cres==(size_t)-1) && (errno==E2BIG));
1712 }
1713
1714 if (ms_wcNeedsSwap)
1715 {
1716 free(tmpbuf);
1717 }
1718
1719 if (ICONV_FAILED(cres, inbuf))
1720 {
1721 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1722 return (size_t)-1;
1723 }
1724
1725 return res;
1726 }
1727
1728 size_t wxMBConv_iconv::GetMBNulLen() const
1729 {
1730 if ( m_minMBCharWidth == 0 )
1731 {
1732 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1733
1734 #if wxUSE_THREADS
1735 // NB: explained in MB2WC
1736 wxMutexLocker lock(self->m_iconvMutex);
1737 #endif
1738
1739 wchar_t *wnul = L"";
1740 char buf[8]; // should be enough for NUL in any encoding
1741 size_t inLen = sizeof(wchar_t),
1742 outLen = WXSIZEOF(buf);
1743 char *in = (char *)wnul;
1744 char *out = buf;
1745 if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1746 {
1747 self->m_minMBCharWidth = (size_t)-1;
1748 }
1749 else // ok
1750 {
1751 self->m_minMBCharWidth = out - buf;
1752 }
1753 }
1754
1755 return m_minMBCharWidth;
1756 }
1757
1758 #endif // HAVE_ICONV
1759
1760
1761 // ============================================================================
1762 // Win32 conversion classes
1763 // ============================================================================
1764
1765 #ifdef wxHAVE_WIN32_MB2WC
1766
1767 // from utils.cpp
1768 #if wxUSE_FONTMAP
1769 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1770 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1771 #endif
1772
1773 class wxMBConv_win32 : public wxMBConv
1774 {
1775 public:
1776 wxMBConv_win32()
1777 {
1778 m_CodePage = CP_ACP;
1779 m_minMBCharWidth = 0;
1780 }
1781
1782 #if wxUSE_FONTMAP
1783 wxMBConv_win32(const wxChar* name)
1784 {
1785 m_CodePage = wxCharsetToCodepage(name);
1786 m_minMBCharWidth = 0;
1787 }
1788
1789 wxMBConv_win32(wxFontEncoding encoding)
1790 {
1791 m_CodePage = wxEncodingToCodepage(encoding);
1792 m_minMBCharWidth = 0;
1793 }
1794 #endif // wxUSE_FONTMAP
1795
1796 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1797 {
1798 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1799 // the behaviour is not compatible with the Unix version (using iconv)
1800 // and break the library itself, e.g. wxTextInputStream::NextChar()
1801 // wouldn't work if reading an incomplete MB char didn't result in an
1802 // error
1803 //
1804 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1805 // an error (tested under Windows Server 2003) and apparently it is
1806 // done on purpose, i.e. the function accepts any input in this case
1807 // and although I'd prefer to return error on ill-formed output, our
1808 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1809 // explicitly ill-formed according to RFC 2152) neither so we don't
1810 // even have any fallback here...
1811 //
1812 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1813 // Win XP or newer and if it is specified on older versions, conversion
1814 // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1815 // fails. So we can only use the flag on newer Windows versions.
1816 // Additionally, the flag is not supported by UTF7, symbol and CJK
1817 // encodings. See here:
1818 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1819 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1820 int flags = 0;
1821 if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1822 m_CodePage < 50000 &&
1823 IsAtLeastWin2kSP4() )
1824 {
1825 flags = MB_ERR_INVALID_CHARS;
1826 }
1827 else if ( m_CodePage == CP_UTF8 )
1828 {
1829 // Avoid round-trip in the special case of UTF-8 by using our
1830 // own UTF-8 conversion code:
1831 return wxMBConvUTF8().MB2WC(buf, psz, n);
1832 }
1833
1834 const size_t len = ::MultiByteToWideChar
1835 (
1836 m_CodePage, // code page
1837 flags, // flags: fall on error
1838 psz, // input string
1839 -1, // its length (NUL-terminated)
1840 buf, // output string
1841 buf ? n : 0 // size of output buffer
1842 );
1843 if ( !len )
1844 {
1845 // function totally failed
1846 return (size_t)-1;
1847 }
1848
1849 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1850 // check if we succeeded, by doing a double trip:
1851 if ( !flags && buf )
1852 {
1853 const size_t mbLen = strlen(psz);
1854 wxCharBuffer mbBuf(mbLen);
1855 if ( ::WideCharToMultiByte
1856 (
1857 m_CodePage,
1858 0,
1859 buf,
1860 -1,
1861 mbBuf.data(),
1862 mbLen + 1, // size in bytes, not length
1863 NULL,
1864 NULL
1865 ) == 0 ||
1866 strcmp(mbBuf, psz) != 0 )
1867 {
1868 // we didn't obtain the same thing we started from, hence
1869 // the conversion was lossy and we consider that it failed
1870 return (size_t)-1;
1871 }
1872 }
1873
1874 // note that it returns count of written chars for buf != NULL and size
1875 // of the needed buffer for buf == NULL so in either case the length of
1876 // the string (which never includes the terminating NUL) is one less
1877 return len - 1;
1878 }
1879
1880 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1881 {
1882 /*
1883 we have a problem here: by default, WideCharToMultiByte() may
1884 replace characters unrepresentable in the target code page with bad
1885 quality approximations such as turning "1/2" symbol (U+00BD) into
1886 "1" for the code pages which don't have it and we, obviously, want
1887 to avoid this at any price
1888
1889 the trouble is that this function does it _silently_, i.e. it won't
1890 even tell us whether it did or not... Win98/2000 and higher provide
1891 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1892 we have to resort to a round trip, i.e. check that converting back
1893 results in the same string -- this is, of course, expensive but
1894 otherwise we simply can't be sure to not garble the data.
1895 */
1896
1897 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1898 // it doesn't work with CJK encodings (which we test for rather roughly
1899 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1900 // supporting it
1901 BOOL usedDef wxDUMMY_INITIALIZE(false);
1902 BOOL *pUsedDef;
1903 int flags;
1904 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1905 {
1906 // it's our lucky day
1907 flags = WC_NO_BEST_FIT_CHARS;
1908 pUsedDef = &usedDef;
1909 }
1910 else // old system or unsupported encoding
1911 {
1912 flags = 0;
1913 pUsedDef = NULL;
1914 }
1915
1916 const size_t len = ::WideCharToMultiByte
1917 (
1918 m_CodePage, // code page
1919 flags, // either none or no best fit
1920 pwz, // input string
1921 -1, // it is (wide) NUL-terminated
1922 buf, // output buffer
1923 buf ? n : 0, // and its size
1924 NULL, // default "replacement" char
1925 pUsedDef // [out] was it used?
1926 );
1927
1928 if ( !len )
1929 {
1930 // function totally failed
1931 return (size_t)-1;
1932 }
1933
1934 // if we were really converting, check if we succeeded
1935 if ( buf )
1936 {
1937 if ( flags )
1938 {
1939 // check if the conversion failed, i.e. if any replacements
1940 // were done
1941 if ( usedDef )
1942 return (size_t)-1;
1943 }
1944 else // we must resort to double tripping...
1945 {
1946 wxWCharBuffer wcBuf(n);
1947 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1948 wcscmp(wcBuf, pwz) != 0 )
1949 {
1950 // we didn't obtain the same thing we started from, hence
1951 // the conversion was lossy and we consider that it failed
1952 return (size_t)-1;
1953 }
1954 }
1955 }
1956
1957 // see the comment above for the reason of "len - 1"
1958 return len - 1;
1959 }
1960
1961 virtual size_t GetMBNulLen() const
1962 {
1963 if ( m_minMBCharWidth == 0 )
1964 {
1965 int len = ::WideCharToMultiByte
1966 (
1967 m_CodePage, // code page
1968 0, // no flags
1969 L"", // input string
1970 1, // translate just the NUL
1971 NULL, // output buffer
1972 0, // and its size
1973 NULL, // no replacement char
1974 NULL // [out] don't care if it was used
1975 );
1976
1977 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1978 switch ( len )
1979 {
1980 default:
1981 wxLogDebug(_T("Unexpected NUL length %d"), len);
1982 // fall through
1983
1984 case 0:
1985 self->m_minMBCharWidth = (size_t)-1;
1986 break;
1987
1988 case 1:
1989 case 2:
1990 case 4:
1991 self->m_minMBCharWidth = len;
1992 break;
1993 }
1994 }
1995
1996 return m_minMBCharWidth;
1997 }
1998
1999 bool IsOk() const { return m_CodePage != -1; }
2000
2001 private:
2002 static bool CanUseNoBestFit()
2003 {
2004 static int s_isWin98Or2k = -1;
2005
2006 if ( s_isWin98Or2k == -1 )
2007 {
2008 int verMaj, verMin;
2009 switch ( wxGetOsVersion(&verMaj, &verMin) )
2010 {
2011 case wxWIN95:
2012 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2013 break;
2014
2015 case wxWINDOWS_NT:
2016 s_isWin98Or2k = verMaj >= 5;
2017 break;
2018
2019 default:
2020 // unknown, be conseravtive by default
2021 s_isWin98Or2k = 0;
2022 }
2023
2024 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2025 }
2026
2027 return s_isWin98Or2k == 1;
2028 }
2029
2030 static bool IsAtLeastWin2kSP4()
2031 {
2032 #ifdef __WXWINCE__
2033 return false;
2034 #else
2035 static int s_isAtLeastWin2kSP4 = -1;
2036
2037 if ( s_isAtLeastWin2kSP4 == -1 )
2038 {
2039 OSVERSIONINFOEX ver;
2040
2041 memset(&ver, 0, sizeof(ver));
2042 ver.dwOSVersionInfoSize = sizeof(ver);
2043 GetVersionEx((OSVERSIONINFO*)&ver);
2044
2045 s_isAtLeastWin2kSP4 =
2046 ((ver.dwMajorVersion > 5) || // Vista+
2047 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2048 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2049 ver.wServicePackMajor >= 4)) // 2000 SP4+
2050 ? 1 : 0;
2051 }
2052
2053 return s_isAtLeastWin2kSP4 == 1;
2054 #endif
2055 }
2056
2057
2058 // the code page we're working with
2059 long m_CodePage;
2060
2061 // cached result of GetMBNulLen(), set to 0 initially meaning
2062 // "unknown"
2063 size_t m_minMBCharWidth;
2064 };
2065
2066 #endif // wxHAVE_WIN32_MB2WC
2067
2068 // ============================================================================
2069 // Cocoa conversion classes
2070 // ============================================================================
2071
2072 #if defined(__WXCOCOA__)
2073
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
2077
2078 #include <CoreFoundation/CFString.h>
2079 #include <CoreFoundation/CFStringEncodingExt.h>
2080
2081 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2082 {
2083 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2084 if ( encoding == wxFONTENCODING_DEFAULT )
2085 {
2086 enc = CFStringGetSystemEncoding();
2087 }
2088 else switch( encoding)
2089 {
2090 case wxFONTENCODING_ISO8859_1 :
2091 enc = kCFStringEncodingISOLatin1 ;
2092 break ;
2093 case wxFONTENCODING_ISO8859_2 :
2094 enc = kCFStringEncodingISOLatin2;
2095 break ;
2096 case wxFONTENCODING_ISO8859_3 :
2097 enc = kCFStringEncodingISOLatin3 ;
2098 break ;
2099 case wxFONTENCODING_ISO8859_4 :
2100 enc = kCFStringEncodingISOLatin4;
2101 break ;
2102 case wxFONTENCODING_ISO8859_5 :
2103 enc = kCFStringEncodingISOLatinCyrillic;
2104 break ;
2105 case wxFONTENCODING_ISO8859_6 :
2106 enc = kCFStringEncodingISOLatinArabic;
2107 break ;
2108 case wxFONTENCODING_ISO8859_7 :
2109 enc = kCFStringEncodingISOLatinGreek;
2110 break ;
2111 case wxFONTENCODING_ISO8859_8 :
2112 enc = kCFStringEncodingISOLatinHebrew;
2113 break ;
2114 case wxFONTENCODING_ISO8859_9 :
2115 enc = kCFStringEncodingISOLatin5;
2116 break ;
2117 case wxFONTENCODING_ISO8859_10 :
2118 enc = kCFStringEncodingISOLatin6;
2119 break ;
2120 case wxFONTENCODING_ISO8859_11 :
2121 enc = kCFStringEncodingISOLatinThai;
2122 break ;
2123 case wxFONTENCODING_ISO8859_13 :
2124 enc = kCFStringEncodingISOLatin7;
2125 break ;
2126 case wxFONTENCODING_ISO8859_14 :
2127 enc = kCFStringEncodingISOLatin8;
2128 break ;
2129 case wxFONTENCODING_ISO8859_15 :
2130 enc = kCFStringEncodingISOLatin9;
2131 break ;
2132
2133 case wxFONTENCODING_KOI8 :
2134 enc = kCFStringEncodingKOI8_R;
2135 break ;
2136 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2137 enc = kCFStringEncodingDOSRussian;
2138 break ;
2139
2140 // case wxFONTENCODING_BULGARIAN :
2141 // enc = ;
2142 // break ;
2143
2144 case wxFONTENCODING_CP437 :
2145 enc =kCFStringEncodingDOSLatinUS ;
2146 break ;
2147 case wxFONTENCODING_CP850 :
2148 enc = kCFStringEncodingDOSLatin1;
2149 break ;
2150 case wxFONTENCODING_CP852 :
2151 enc = kCFStringEncodingDOSLatin2;
2152 break ;
2153 case wxFONTENCODING_CP855 :
2154 enc = kCFStringEncodingDOSCyrillic;
2155 break ;
2156 case wxFONTENCODING_CP866 :
2157 enc =kCFStringEncodingDOSRussian ;
2158 break ;
2159 case wxFONTENCODING_CP874 :
2160 enc = kCFStringEncodingDOSThai;
2161 break ;
2162 case wxFONTENCODING_CP932 :
2163 enc = kCFStringEncodingDOSJapanese;
2164 break ;
2165 case wxFONTENCODING_CP936 :
2166 enc =kCFStringEncodingDOSChineseSimplif ;
2167 break ;
2168 case wxFONTENCODING_CP949 :
2169 enc = kCFStringEncodingDOSKorean;
2170 break ;
2171 case wxFONTENCODING_CP950 :
2172 enc = kCFStringEncodingDOSChineseTrad;
2173 break ;
2174 case wxFONTENCODING_CP1250 :
2175 enc = kCFStringEncodingWindowsLatin2;
2176 break ;
2177 case wxFONTENCODING_CP1251 :
2178 enc =kCFStringEncodingWindowsCyrillic ;
2179 break ;
2180 case wxFONTENCODING_CP1252 :
2181 enc =kCFStringEncodingWindowsLatin1 ;
2182 break ;
2183 case wxFONTENCODING_CP1253 :
2184 enc = kCFStringEncodingWindowsGreek;
2185 break ;
2186 case wxFONTENCODING_CP1254 :
2187 enc = kCFStringEncodingWindowsLatin5;
2188 break ;
2189 case wxFONTENCODING_CP1255 :
2190 enc =kCFStringEncodingWindowsHebrew ;
2191 break ;
2192 case wxFONTENCODING_CP1256 :
2193 enc =kCFStringEncodingWindowsArabic ;
2194 break ;
2195 case wxFONTENCODING_CP1257 :
2196 enc = kCFStringEncodingWindowsBalticRim;
2197 break ;
2198 // This only really encodes to UTF7 (if that) evidently
2199 // case wxFONTENCODING_UTF7 :
2200 // enc = kCFStringEncodingNonLossyASCII ;
2201 // break ;
2202 case wxFONTENCODING_UTF8 :
2203 enc = kCFStringEncodingUTF8 ;
2204 break ;
2205 case wxFONTENCODING_EUC_JP :
2206 enc = kCFStringEncodingEUC_JP;
2207 break ;
2208 case wxFONTENCODING_UTF16 :
2209 enc = kCFStringEncodingUnicode ;
2210 break ;
2211 case wxFONTENCODING_MACROMAN :
2212 enc = kCFStringEncodingMacRoman ;
2213 break ;
2214 case wxFONTENCODING_MACJAPANESE :
2215 enc = kCFStringEncodingMacJapanese ;
2216 break ;
2217 case wxFONTENCODING_MACCHINESETRAD :
2218 enc = kCFStringEncodingMacChineseTrad ;
2219 break ;
2220 case wxFONTENCODING_MACKOREAN :
2221 enc = kCFStringEncodingMacKorean ;
2222 break ;
2223 case wxFONTENCODING_MACARABIC :
2224 enc = kCFStringEncodingMacArabic ;
2225 break ;
2226 case wxFONTENCODING_MACHEBREW :
2227 enc = kCFStringEncodingMacHebrew ;
2228 break ;
2229 case wxFONTENCODING_MACGREEK :
2230 enc = kCFStringEncodingMacGreek ;
2231 break ;
2232 case wxFONTENCODING_MACCYRILLIC :
2233 enc = kCFStringEncodingMacCyrillic ;
2234 break ;
2235 case wxFONTENCODING_MACDEVANAGARI :
2236 enc = kCFStringEncodingMacDevanagari ;
2237 break ;
2238 case wxFONTENCODING_MACGURMUKHI :
2239 enc = kCFStringEncodingMacGurmukhi ;
2240 break ;
2241 case wxFONTENCODING_MACGUJARATI :
2242 enc = kCFStringEncodingMacGujarati ;
2243 break ;
2244 case wxFONTENCODING_MACORIYA :
2245 enc = kCFStringEncodingMacOriya ;
2246 break ;
2247 case wxFONTENCODING_MACBENGALI :
2248 enc = kCFStringEncodingMacBengali ;
2249 break ;
2250 case wxFONTENCODING_MACTAMIL :
2251 enc = kCFStringEncodingMacTamil ;
2252 break ;
2253 case wxFONTENCODING_MACTELUGU :
2254 enc = kCFStringEncodingMacTelugu ;
2255 break ;
2256 case wxFONTENCODING_MACKANNADA :
2257 enc = kCFStringEncodingMacKannada ;
2258 break ;
2259 case wxFONTENCODING_MACMALAJALAM :
2260 enc = kCFStringEncodingMacMalayalam ;
2261 break ;
2262 case wxFONTENCODING_MACSINHALESE :
2263 enc = kCFStringEncodingMacSinhalese ;
2264 break ;
2265 case wxFONTENCODING_MACBURMESE :
2266 enc = kCFStringEncodingMacBurmese ;
2267 break ;
2268 case wxFONTENCODING_MACKHMER :
2269 enc = kCFStringEncodingMacKhmer ;
2270 break ;
2271 case wxFONTENCODING_MACTHAI :
2272 enc = kCFStringEncodingMacThai ;
2273 break ;
2274 case wxFONTENCODING_MACLAOTIAN :
2275 enc = kCFStringEncodingMacLaotian ;
2276 break ;
2277 case wxFONTENCODING_MACGEORGIAN :
2278 enc = kCFStringEncodingMacGeorgian ;
2279 break ;
2280 case wxFONTENCODING_MACARMENIAN :
2281 enc = kCFStringEncodingMacArmenian ;
2282 break ;
2283 case wxFONTENCODING_MACCHINESESIMP :
2284 enc = kCFStringEncodingMacChineseSimp ;
2285 break ;
2286 case wxFONTENCODING_MACTIBETAN :
2287 enc = kCFStringEncodingMacTibetan ;
2288 break ;
2289 case wxFONTENCODING_MACMONGOLIAN :
2290 enc = kCFStringEncodingMacMongolian ;
2291 break ;
2292 case wxFONTENCODING_MACETHIOPIC :
2293 enc = kCFStringEncodingMacEthiopic ;
2294 break ;
2295 case wxFONTENCODING_MACCENTRALEUR :
2296 enc = kCFStringEncodingMacCentralEurRoman ;
2297 break ;
2298 case wxFONTENCODING_MACVIATNAMESE :
2299 enc = kCFStringEncodingMacVietnamese ;
2300 break ;
2301 case wxFONTENCODING_MACARABICEXT :
2302 enc = kCFStringEncodingMacExtArabic ;
2303 break ;
2304 case wxFONTENCODING_MACSYMBOL :
2305 enc = kCFStringEncodingMacSymbol ;
2306 break ;
2307 case wxFONTENCODING_MACDINGBATS :
2308 enc = kCFStringEncodingMacDingbats ;
2309 break ;
2310 case wxFONTENCODING_MACTURKISH :
2311 enc = kCFStringEncodingMacTurkish ;
2312 break ;
2313 case wxFONTENCODING_MACCROATIAN :
2314 enc = kCFStringEncodingMacCroatian ;
2315 break ;
2316 case wxFONTENCODING_MACICELANDIC :
2317 enc = kCFStringEncodingMacIcelandic ;
2318 break ;
2319 case wxFONTENCODING_MACROMANIAN :
2320 enc = kCFStringEncodingMacRomanian ;
2321 break ;
2322 case wxFONTENCODING_MACCELTIC :
2323 enc = kCFStringEncodingMacCeltic ;
2324 break ;
2325 case wxFONTENCODING_MACGAELIC :
2326 enc = kCFStringEncodingMacGaelic ;
2327 break ;
2328 // case wxFONTENCODING_MACKEYBOARD :
2329 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2330 // break ;
2331 default :
2332 // because gcc is picky
2333 break ;
2334 } ;
2335 return enc ;
2336 }
2337
2338 class wxMBConv_cocoa : public wxMBConv
2339 {
2340 public:
2341 wxMBConv_cocoa()
2342 {
2343 Init(CFStringGetSystemEncoding()) ;
2344 }
2345
2346 #if wxUSE_FONTMAP
2347 wxMBConv_cocoa(const wxChar* name)
2348 {
2349 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2350 }
2351 #endif
2352
2353 wxMBConv_cocoa(wxFontEncoding encoding)
2354 {
2355 Init( wxCFStringEncFromFontEnc(encoding) );
2356 }
2357
2358 ~wxMBConv_cocoa()
2359 {
2360 }
2361
2362 void Init( CFStringEncoding encoding)
2363 {
2364 m_encoding = encoding ;
2365 }
2366
2367 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2368 {
2369 wxASSERT(szUnConv);
2370
2371 CFStringRef theString = CFStringCreateWithBytes (
2372 NULL, //the allocator
2373 (const UInt8*)szUnConv,
2374 strlen(szUnConv),
2375 m_encoding,
2376 false //no BOM/external representation
2377 );
2378
2379 wxASSERT(theString);
2380
2381 size_t nOutLength = CFStringGetLength(theString);
2382
2383 if (szOut == NULL)
2384 {
2385 CFRelease(theString);
2386 return nOutLength;
2387 }
2388
2389 CFRange theRange = { 0, nOutSize };
2390
2391 #if SIZEOF_WCHAR_T == 4
2392 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2393 #endif
2394
2395 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2396
2397 CFRelease(theString);
2398
2399 szUniCharBuffer[nOutLength] = '\0' ;
2400
2401 #if SIZEOF_WCHAR_T == 4
2402 wxMBConvUTF16 converter ;
2403 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2404 delete[] szUniCharBuffer;
2405 #endif
2406
2407 return nOutLength;
2408 }
2409
2410 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2411 {
2412 wxASSERT(szUnConv);
2413
2414 size_t nRealOutSize;
2415 size_t nBufSize = wxWcslen(szUnConv);
2416 UniChar* szUniBuffer = (UniChar*) szUnConv;
2417
2418 #if SIZEOF_WCHAR_T == 4
2419 wxMBConvUTF16 converter ;
2420 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2421 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2422 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2423 nBufSize /= sizeof(UniChar);
2424 #endif
2425
2426 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2427 NULL, //allocator
2428 szUniBuffer,
2429 nBufSize,
2430 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2431 );
2432
2433 wxASSERT(theString);
2434
2435 //Note that CER puts a BOM when converting to unicode
2436 //so we check and use getchars instead in that case
2437 if (m_encoding == kCFStringEncodingUnicode)
2438 {
2439 if (szOut != NULL)
2440 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2441
2442 nRealOutSize = CFStringGetLength(theString) + 1;
2443 }
2444 else
2445 {
2446 CFStringGetBytes(
2447 theString,
2448 CFRangeMake(0, CFStringGetLength(theString)),
2449 m_encoding,
2450 0, //what to put in characters that can't be converted -
2451 //0 tells CFString to return NULL if it meets such a character
2452 false, //not an external representation
2453 (UInt8*) szOut,
2454 nOutSize,
2455 (CFIndex*) &nRealOutSize
2456 );
2457 }
2458
2459 CFRelease(theString);
2460
2461 #if SIZEOF_WCHAR_T == 4
2462 delete[] szUniBuffer;
2463 #endif
2464
2465 return nRealOutSize - 1;
2466 }
2467
2468 bool IsOk() const
2469 {
2470 return m_encoding != kCFStringEncodingInvalidId &&
2471 CFStringIsEncodingAvailable(m_encoding);
2472 }
2473
2474 private:
2475 CFStringEncoding m_encoding ;
2476 };
2477
2478 #endif // defined(__WXCOCOA__)
2479
2480 // ============================================================================
2481 // Mac conversion classes
2482 // ============================================================================
2483
2484 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2485
2486 class wxMBConv_mac : public wxMBConv
2487 {
2488 public:
2489 wxMBConv_mac()
2490 {
2491 Init(CFStringGetSystemEncoding()) ;
2492 }
2493
2494 #if wxUSE_FONTMAP
2495 wxMBConv_mac(const wxChar* name)
2496 {
2497 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2498 }
2499 #endif
2500
2501 wxMBConv_mac(wxFontEncoding encoding)
2502 {
2503 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2504 }
2505
2506 ~wxMBConv_mac()
2507 {
2508 OSStatus status = noErr ;
2509 status = TECDisposeConverter(m_MB2WC_converter);
2510 status = TECDisposeConverter(m_WC2MB_converter);
2511 }
2512
2513
2514 void Init( TextEncodingBase encoding)
2515 {
2516 OSStatus status = noErr ;
2517 m_char_encoding = encoding ;
2518 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2519
2520 status = TECCreateConverter(&m_MB2WC_converter,
2521 m_char_encoding,
2522 m_unicode_encoding);
2523 status = TECCreateConverter(&m_WC2MB_converter,
2524 m_unicode_encoding,
2525 m_char_encoding);
2526 }
2527
2528 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2529 {
2530 OSStatus status = noErr ;
2531 ByteCount byteOutLen ;
2532 ByteCount byteInLen = strlen(psz) ;
2533 wchar_t *tbuf = NULL ;
2534 UniChar* ubuf = NULL ;
2535 size_t res = 0 ;
2536
2537 if (buf == NULL)
2538 {
2539 //apple specs say at least 32
2540 n = wxMax( 32 , byteInLen ) ;
2541 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2542 }
2543 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2544 #if SIZEOF_WCHAR_T == 4
2545 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2546 #else
2547 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2548 #endif
2549 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2550 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2551 #if SIZEOF_WCHAR_T == 4
2552 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2553 // is not properly terminated we get random characters at the end
2554 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2555 wxMBConvUTF16 converter ;
2556 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2557 free( ubuf ) ;
2558 #else
2559 res = byteOutLen / sizeof( UniChar ) ;
2560 #endif
2561 if ( buf == NULL )
2562 free(tbuf) ;
2563
2564 if ( buf && res < n)
2565 buf[res] = 0;
2566
2567 return res ;
2568 }
2569
2570 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2571 {
2572 OSStatus status = noErr ;
2573 ByteCount byteOutLen ;
2574 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2575
2576 char *tbuf = NULL ;
2577
2578 if (buf == NULL)
2579 {
2580 //apple specs say at least 32
2581 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2582 tbuf = (char*) malloc( n ) ;
2583 }
2584
2585 ByteCount byteBufferLen = n ;
2586 UniChar* ubuf = NULL ;
2587 #if SIZEOF_WCHAR_T == 4
2588 wxMBConvUTF16 converter ;
2589 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2590 byteInLen = unicharlen ;
2591 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2592 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2593 #else
2594 ubuf = (UniChar*) psz ;
2595 #endif
2596 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2597 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2598 #if SIZEOF_WCHAR_T == 4
2599 free( ubuf ) ;
2600 #endif
2601 if ( buf == NULL )
2602 free(tbuf) ;
2603
2604 size_t res = byteOutLen ;
2605 if ( buf && res < n)
2606 {
2607 buf[res] = 0;
2608
2609 //we need to double-trip to verify it didn't insert any ? in place
2610 //of bogus characters
2611 wxWCharBuffer wcBuf(n);
2612 size_t pszlen = wxWcslen(psz);
2613 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2614 wxWcslen(wcBuf) != pszlen ||
2615 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2616 {
2617 // we didn't obtain the same thing we started from, hence
2618 // the conversion was lossy and we consider that it failed
2619 return (size_t)-1;
2620 }
2621 }
2622
2623 return res ;
2624 }
2625
2626 bool IsOk() const
2627 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2628
2629 private:
2630 TECObjectRef m_MB2WC_converter ;
2631 TECObjectRef m_WC2MB_converter ;
2632
2633 TextEncodingBase m_char_encoding ;
2634 TextEncodingBase m_unicode_encoding ;
2635 };
2636
2637 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2638
2639 // ============================================================================
2640 // wxEncodingConverter based conversion classes
2641 // ============================================================================
2642
2643 #if wxUSE_FONTMAP
2644
2645 class wxMBConv_wxwin : public wxMBConv
2646 {
2647 private:
2648 void Init()
2649 {
2650 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2651 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2652 }
2653
2654 public:
2655 // temporarily just use wxEncodingConverter stuff,
2656 // so that it works while a better implementation is built
2657 wxMBConv_wxwin(const wxChar* name)
2658 {
2659 if (name)
2660 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2661 else
2662 m_enc = wxFONTENCODING_SYSTEM;
2663
2664 Init();
2665 }
2666
2667 wxMBConv_wxwin(wxFontEncoding enc)
2668 {
2669 m_enc = enc;
2670
2671 Init();
2672 }
2673
2674 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2675 {
2676 size_t inbuf = strlen(psz);
2677 if (buf)
2678 {
2679 if (!m2w.Convert(psz,buf))
2680 return (size_t)-1;
2681 }
2682 return inbuf;
2683 }
2684
2685 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2686 {
2687 const size_t inbuf = wxWcslen(psz);
2688 if (buf)
2689 {
2690 if (!w2m.Convert(psz,buf))
2691 return (size_t)-1;
2692 }
2693
2694 return inbuf;
2695 }
2696
2697 virtual size_t GetMBNulLen() const
2698 {
2699 switch ( m_enc )
2700 {
2701 case wxFONTENCODING_UTF16BE:
2702 case wxFONTENCODING_UTF16LE:
2703 return 2;
2704
2705 case wxFONTENCODING_UTF32BE:
2706 case wxFONTENCODING_UTF32LE:
2707 return 4;
2708
2709 default:
2710 return 1;
2711 }
2712 }
2713
2714 bool IsOk() const { return m_ok; }
2715
2716 public:
2717 wxFontEncoding m_enc;
2718 wxEncodingConverter m2w, w2m;
2719
2720 private:
2721 // were we initialized successfully?
2722 bool m_ok;
2723
2724 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2725 };
2726
2727 // make the constructors available for unit testing
2728 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2729 {
2730 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2731 if ( !result->IsOk() )
2732 {
2733 delete result;
2734 return 0;
2735 }
2736 return result;
2737 }
2738
2739 #endif // wxUSE_FONTMAP
2740
2741 // ============================================================================
2742 // wxCSConv implementation
2743 // ============================================================================
2744
2745 void wxCSConv::Init()
2746 {
2747 m_name = NULL;
2748 m_convReal = NULL;
2749 m_deferred = true;
2750 }
2751
2752 wxCSConv::wxCSConv(const wxChar *charset)
2753 {
2754 Init();
2755
2756 if ( charset )
2757 {
2758 SetName(charset);
2759 }
2760
2761 #if wxUSE_FONTMAP
2762 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2763 #else
2764 m_encoding = wxFONTENCODING_SYSTEM;
2765 #endif
2766 }
2767
2768 wxCSConv::wxCSConv(wxFontEncoding encoding)
2769 {
2770 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2771 {
2772 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2773
2774 encoding = wxFONTENCODING_SYSTEM;
2775 }
2776
2777 Init();
2778
2779 m_encoding = encoding;
2780 }
2781
2782 wxCSConv::~wxCSConv()
2783 {
2784 Clear();
2785 }
2786
2787 wxCSConv::wxCSConv(const wxCSConv& conv)
2788 : wxMBConv()
2789 {
2790 Init();
2791
2792 SetName(conv.m_name);
2793 m_encoding = conv.m_encoding;
2794 }
2795
2796 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2797 {
2798 Clear();
2799
2800 SetName(conv.m_name);
2801 m_encoding = conv.m_encoding;
2802
2803 return *this;
2804 }
2805
2806 void wxCSConv::Clear()
2807 {
2808 free(m_name);
2809 delete m_convReal;
2810
2811 m_name = NULL;
2812 m_convReal = NULL;
2813 }
2814
2815 void wxCSConv::SetName(const wxChar *charset)
2816 {
2817 if (charset)
2818 {
2819 m_name = wxStrdup(charset);
2820 m_deferred = true;
2821 }
2822 }
2823
#if wxUSE_FONTMAP

#include "wx/hashmap.h"

// hash map from a font encoding to a charset name
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

// the cache consulted by wxCSConv::DoCreate() in its iconv step: it holds the
// name which worked for an encoding or an empty string if none did
static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
2832
2833 wxMBConv *wxCSConv::DoCreate() const
2834 {
2835 #if wxUSE_FONTMAP
2836 wxLogTrace(TRACE_STRCONV,
2837 wxT("creating conversion for %s"),
2838 (m_name ? m_name
2839 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2840 #endif // wxUSE_FONTMAP
2841
2842 // check for the special case of ASCII or ISO8859-1 charset: as we have
2843 // special knowledge of it anyhow, we don't need to create a special
2844 // conversion object
2845 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2846 m_encoding == wxFONTENCODING_DEFAULT )
2847 {
2848 // don't convert at all
2849 return NULL;
2850 }
2851
2852 // we trust OS to do conversion better than we can so try external
2853 // conversion methods first
2854 //
2855 // the full order is:
2856 // 1. OS conversion (iconv() under Unix or Win32 API)
2857 // 2. hard coded conversions for UTF
2858 // 3. wxEncodingConverter as fall back
2859
2860 // step (1)
2861 #ifdef HAVE_ICONV
2862 #if !wxUSE_FONTMAP
2863 if ( m_name )
2864 #endif // !wxUSE_FONTMAP
2865 {
2866 wxString name(m_name);
2867 wxFontEncoding encoding(m_encoding);
2868
2869 if ( !name.empty() )
2870 {
2871 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2872 if ( conv->IsOk() )
2873 return conv;
2874
2875 delete conv;
2876
2877 #if wxUSE_FONTMAP
2878 encoding =
2879 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2880 #endif // wxUSE_FONTMAP
2881 }
2882 #if wxUSE_FONTMAP
2883 {
2884 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2885 if ( it != gs_nameCache.end() )
2886 {
2887 if ( it->second.empty() )
2888 return NULL;
2889
2890 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2891 if ( conv->IsOk() )
2892 return conv;
2893
2894 delete conv;
2895 }
2896
2897 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2898
2899 for ( ; *names; ++names )
2900 {
2901 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2902 if ( conv->IsOk() )
2903 {
2904 gs_nameCache[encoding] = *names;
2905 return conv;
2906 }
2907
2908 delete conv;
2909 }
2910
2911 gs_nameCache[encoding] = _T(""); // cache the failure
2912 }
2913 #endif // wxUSE_FONTMAP
2914 }
2915 #endif // HAVE_ICONV
2916
2917 #ifdef wxHAVE_WIN32_MB2WC
2918 {
2919 #if wxUSE_FONTMAP
2920 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2921 : new wxMBConv_win32(m_encoding);
2922 if ( conv->IsOk() )
2923 return conv;
2924
2925 delete conv;
2926 #else
2927 return NULL;
2928 #endif
2929 }
2930 #endif // wxHAVE_WIN32_MB2WC
2931 #if defined(__WXMAC__)
2932 {
2933 // leave UTF16 and UTF32 to the built-ins of wx
2934 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2935 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2936 {
2937
2938 #if wxUSE_FONTMAP
2939 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2940 : new wxMBConv_mac(m_encoding);
2941 #else
2942 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2943 #endif
2944 if ( conv->IsOk() )
2945 return conv;
2946
2947 delete conv;
2948 }
2949 }
2950 #endif
2951 #if defined(__WXCOCOA__)
2952 {
2953 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2954 {
2955
2956 #if wxUSE_FONTMAP
2957 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2958 : new wxMBConv_cocoa(m_encoding);
2959 #else
2960 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2961 #endif
2962 if ( conv->IsOk() )
2963 return conv;
2964
2965 delete conv;
2966 }
2967 }
2968 #endif
2969 // step (2)
2970 wxFontEncoding enc = m_encoding;
2971 #if wxUSE_FONTMAP
2972 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2973 {
2974 // use "false" to suppress interactive dialogs -- we can be called from
2975 // anywhere and popping up a dialog from here is the last thing we want to
2976 // do
2977 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2978 }
2979 #endif // wxUSE_FONTMAP
2980
2981 switch ( enc )
2982 {
2983 case wxFONTENCODING_UTF7:
2984 return new wxMBConvUTF7;
2985
2986 case wxFONTENCODING_UTF8:
2987 return new wxMBConvUTF8;
2988
2989 case wxFONTENCODING_UTF16BE:
2990 return new wxMBConvUTF16BE;
2991
2992 case wxFONTENCODING_UTF16LE:
2993 return new wxMBConvUTF16LE;
2994
2995 case wxFONTENCODING_UTF32BE:
2996 return new wxMBConvUTF32BE;
2997
2998 case wxFONTENCODING_UTF32LE:
2999 return new wxMBConvUTF32LE;
3000
3001 default:
3002 // nothing to do but put here to suppress gcc warnings
3003 ;
3004 }
3005
3006 // step (3)
3007 #if wxUSE_FONTMAP
3008 {
3009 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3010 : new wxMBConv_wxwin(m_encoding);
3011 if ( conv->IsOk() )
3012 return conv;
3013
3014 delete conv;
3015 }
3016 #endif // wxUSE_FONTMAP
3017
3018 // NB: This is a hack to prevent deadlock. What could otherwise happen
3019 // in Unicode build: wxConvLocal creation ends up being here
3020 // because of some failure and logs the error. But wxLog will try to
3021 // attach timestamp, for which it will need wxConvLocal (to convert
3022 // time to char* and then wchar_t*), but that fails, tries to log
3023 // error, but wxLog has a (already locked) critical section that
3024 // guards static buffer.
3025 static bool alreadyLoggingError = false;
3026 if (!alreadyLoggingError)
3027 {
3028 alreadyLoggingError = true;
3029 wxLogError(_("Cannot convert from the charset '%s'!"),
3030 m_name ? m_name
3031 :
3032 #if wxUSE_FONTMAP
3033 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3034 #else // !wxUSE_FONTMAP
3035 wxString::Format(_("encoding %s"), m_encoding).c_str()
3036 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3037 );
3038 alreadyLoggingError = false;
3039 }
3040
3041 return NULL;
3042 }
3043
3044 void wxCSConv::CreateConvIfNeeded() const
3045 {
3046 if ( m_deferred )
3047 {
3048 wxCSConv *self = (wxCSConv *)this; // const_cast
3049
3050 #if wxUSE_INTL
3051 // if we don't have neither the name nor the encoding, use the default
3052 // encoding for this system
3053 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3054 {
3055 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3056 }
3057 #endif // wxUSE_INTL
3058
3059 self->m_convReal = DoCreate();
3060 self->m_deferred = false;
3061 }
3062 }
3063
3064 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3065 {
3066 CreateConvIfNeeded();
3067
3068 if (m_convReal)
3069 return m_convReal->MB2WC(buf, psz, n);
3070
3071 // latin-1 (direct)
3072 size_t len = strlen(psz);
3073
3074 if (buf)
3075 {
3076 for (size_t c = 0; c <= len; c++)
3077 buf[c] = (unsigned char)(psz[c]);
3078 }
3079
3080 return len;
3081 }
3082
3083 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3084 {
3085 CreateConvIfNeeded();
3086
3087 if (m_convReal)
3088 return m_convReal->WC2MB(buf, psz, n);
3089
3090 // latin-1 (direct)
3091 const size_t len = wxWcslen(psz);
3092 if (buf)
3093 {
3094 for (size_t c = 0; c <= len; c++)
3095 {
3096 if (psz[c] > 0xFF)
3097 return (size_t)-1;
3098 buf[c] = (char)psz[c];
3099 }
3100 }
3101 else
3102 {
3103 for (size_t c = 0; c <= len; c++)
3104 {
3105 if (psz[c] > 0xFF)
3106 return (size_t)-1;
3107 }
3108 }
3109
3110 return len;
3111 }
3112
3113 size_t wxCSConv::GetMBNulLen() const
3114 {
3115 CreateConvIfNeeded();
3116
3117 if ( m_convReal )
3118 {
3119 return m_convReal->GetMBNulLen();
3120 }
3121
3122 return 1;
3123 }
3124
3125 // ----------------------------------------------------------------------------
3126 // globals
3127 // ----------------------------------------------------------------------------
3128
3129 #ifdef __WINDOWS__
3130 static wxMBConv_win32 wxConvLibcObj;
3131 #elif defined(__WXMAC__) && !defined(__MACH__)
3132 static wxMBConv_mac wxConvLibcObj ;
3133 #else
3134 static wxMBConvLibc wxConvLibcObj;
3135 #endif
3136
3137 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3138 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3139 static wxMBConvUTF7 wxConvUTF7Obj;
3140 static wxMBConvUTF8 wxConvUTF8Obj;
3141
3142 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3143 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3144 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3145 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3146 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3147 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3148 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3149 #ifdef __WXOSX__
3150 wxConvUTF8Obj;
3151 #else
3152 wxConvLibcObj;
3153 #endif
3154
3155
3156 #else // !wxUSE_WCHAR_T
3157
3158 // stand-ins in absence of wchar_t
3159 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3160 wxConvISO8859_1,
3161 wxConvLocal,
3162 wxConvUTF8;
3163
3164 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T