]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
91483e70e07a76f14c279f7d5f7a1183d54ffa9a
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
25
26 #ifdef __BORLANDC__
27 #pragma hdrstop
28 #endif
29
30 #ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33 #endif // WX_PRECOMP
34
35 #include "wx/strconv.h"
36
37 #if wxUSE_WCHAR_T
38
39 #ifdef __WINDOWS__
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #endif
43
44 #ifndef __WXWINCE__
45 #include <errno.h>
46 #endif
47
48 #include <ctype.h>
49 #include <string.h>
50 #include <stdlib.h>
51
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
55
56 #ifdef __SALFORDC__
57 #include <clib.h>
58 #endif
59
60 #ifdef HAVE_ICONV
61 #include <iconv.h>
62 #include "wx/thread.h"
63 #endif
64
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
67 #include "wx/utils.h"
68
69 #ifdef __WXMAC__
70 #ifndef __DARWIN__
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
74 #endif
75
76 #include "wx/mac/private.h" // includes mac headers
77 #endif
78
79 #define TRACE_STRCONV _T("strconv")
80
81 #if SIZEOF_WCHAR_T == 2
82 #define WC_UTF16
83 #endif
84
85 // ============================================================================
86 // implementation
87 // ============================================================================
88
// Helper of wxMBConv::ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL (and hence false if n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
97
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4
100 // ----------------------------------------------------------------------------
101
102
103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
104 {
105 if (input<=0xffff)
106 {
107 if (output)
108 *output = (wxUint16) input;
109 return 1;
110 }
111 else if (input>=0x110000)
112 {
113 return (size_t)-1;
114 }
115 else
116 {
117 if (output)
118 {
119 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
120 *output = (wxUint16) ((input&0x3ff)+0xdc00);
121 }
122 return 2;
123 }
124 }
125
126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
127 {
128 if ((*input<0xd800) || (*input>0xdfff))
129 {
130 output = *input;
131 return 1;
132 }
133 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
134 {
135 output = *input;
136 return (size_t)-1;
137 }
138 else
139 {
140 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
141 return 2;
142 }
143 }
144
145
146 // ----------------------------------------------------------------------------
147 // wxMBConv
148 // ----------------------------------------------------------------------------
149
// Default ToWChar() implemented on top of the legacy MB2WC(): the input is
// converted one NUL-terminated chunk at a time.
//
// dst may be NULL, in which case nothing is stored and only the required size
// is computed. srcLen is the input length in bytes or (size_t)-1 if src is
// NUL-terminated.
//
// Returns the number of wide characters written (or needed), counting one
// terminating NUL per chunk, or wxCONV_FAILED if MB2WC() or GetMBNulLen()
// fails or if dstLen is too small.
150 size_t
151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
152 const char *src, size_t srcLen) const
153 {
154 // although new conversion classes are supposed to implement this function
155 // directly, the existing ones only implement the old MB2WC() and so, to
156 // avoid having to rewrite all conversion classes at once, we provide a
157 // default (but not efficient) implementation of this one in terms of the
158 // old function by copying the input to ensure that it's NUL-terminated and
159 // then using MB2WC() to convert it
160
161 // the number of chars [which would be] written to dst [if it were not NULL]
162 size_t dstWritten = 0;
163
164 // the number of NULs terminating this string
165 size_t nulLen wxDUMMY_INITIALIZE(0);
166
167 // if we were not given the input size we just have to assume that the
168 // string is properly terminated as we have no way of knowing how long it
169 // is anyhow, but if we do have the size check whether there are enough
170 // NULs at the end
171 wxCharBuffer bufTmp;
172 const char *srcEnd;
173 if ( srcLen != (size_t)-1 )
174 {
175 // we need to know how to find the end of this string
176 nulLen = GetMBNulLen();
177 if ( nulLen == wxCONV_FAILED )
178 return wxCONV_FAILED;
179
180 // if there are enough NULs we can avoid the copy
181 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
182 {
183 // make a copy in order to properly NUL-terminate the string
184 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
185 char * const p = bufTmp.data();
186 memcpy(p, src, srcLen);
// append nulLen NUL bytes after the copied data
187 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
188 *s = '\0';
189
190 src = bufTmp;
191 }
192
193 srcEnd = src + srcLen;
194 }
195 else // quit after the first loop iteration
196 {
197 srcEnd = NULL;
198 }
199
200 for ( ;; )
201 {
202 // try to convert the current chunk
203 size_t lenChunk = MB2WC(NULL, src, 0);
204 if ( lenChunk == 0 )
205 {
206 // nothing left in the input string, conversion succeeded;
207 // but still account for the trailing NUL
// NOTE(review): on this path nothing is stored in dst and dstWritten is
// not checked against dstLen even though the NUL is counted -- confirm
// that callers don't rely on the terminator being written here
208 dstWritten++;
209 break;
210 }
211
212 if ( lenChunk == wxCONV_FAILED )
213 return wxCONV_FAILED;
214
215 lenChunk++; // for trailing NUL
216
217 dstWritten += lenChunk;
218
219 if ( dst )
220 {
221 if ( dstWritten > dstLen )
222 return wxCONV_FAILED;
223
224 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
225 return wxCONV_FAILED;
226
227 dst += lenChunk;
228 }
229
230 if ( !srcEnd )
231 {
232 // we convert the entire string in this case, as we suppose that the
233 // string is NUL-terminated and so srcEnd is not used at all
234 break;
235 }
236
237 // advance the input pointer past the end of this chunk
238 while ( NotAllNULs(src, nulLen) )
239 {
240 // notice that we must skip over multiple bytes here as we suppose
241 // that if NUL takes 2 or 4 bytes, then all the other characters do
242 // too and so if advanced by a single byte we might erroneously
243 // detect sequences of NUL bytes in the middle of the input
244 src += nulLen;
245 }
246
247 src += nulLen; // skipping over its terminator as well
248
249 // note that ">=" (and not just "==") is needed here as the terminator
250 // we skipped just above could be inside or just after the buffer
251 // delimited by srcEnd
252 if ( src >= srcEnd )
253 break;
254 }
255
256 return dstWritten;
257 }
258
259 size_t
260 wxMBConv::FromWChar(char *dst, size_t dstLen,
261 const wchar_t *src, size_t srcLen) const
262 {
263 // the number of chars [which would be] written to dst [if it were not NULL]
264 size_t dstWritten = 0;
265
266 // make a copy of the input string unless it is already properly
267 // NUL-terminated
268 //
269 // if we don't know its length we have no choice but to assume that it is,
270 // indeed, properly terminated
271 wxWCharBuffer bufTmp;
272 if ( srcLen == (size_t)-1 )
273 {
274 srcLen = wxWcslen(src) + 1;
275 }
276 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
277 {
278 // make a copy in order to properly NUL-terminate the string
279 bufTmp = wxWCharBuffer(srcLen);
280 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
281 src = bufTmp;
282 }
283
284 const size_t lenNul = GetMBNulLen();
285 for ( const wchar_t * const srcEnd = src + srcLen;
286 src < srcEnd;
287 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
288 {
289 // try to convert the current chunk
290 size_t lenChunk = WC2MB(NULL, src, 0);
291
292 if ( lenChunk == wxCONV_FAILED )
293 return wxCONV_FAILED;
294
295 lenChunk += lenNul;
296 dstWritten += lenChunk;
297
298 if ( dst )
299 {
300 if ( dstWritten > dstLen )
301 return wxCONV_FAILED;
302
303 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
304 return wxCONV_FAILED;
305
306 dst += lenChunk;
307 }
308 }
309
310 return dstWritten;
311 }
312
313 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
314 {
315 size_t rc = ToWChar(outBuff, outLen, inBuff);
316 if ( rc != (size_t)wxCONV_FAILED )
317 {
318 // ToWChar() returns the buffer length, i.e. including the trailing
319 // NUL, while this method doesn't take it into account
320 rc--;
321 }
322
323 return rc;
324 }
325
326 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
327 {
328 size_t rc = FromWChar(outBuff, outLen, inBuff);
329 if ( rc != (size_t)wxCONV_FAILED )
330 {
331 rc -= GetMBNulLen();
332 }
333
334 return rc;
335 }
336
// Out-of-line destructor definition; the body is intentionally empty.
337 wxMBConv::~wxMBConv()
338 {
339 // nothing to do here (necessary for Darwin linking probably)
340 }
341
342 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
343 {
344 if ( psz )
345 {
346 // calculate the length of the buffer needed first
347 const size_t nLen = MB2WC(NULL, psz, 0);
348 if ( nLen != (size_t)wxCONV_FAILED )
349 {
350 // now do the actual conversion
351 wxWCharBuffer buf(nLen /* +1 added implicitly */);
352
353 // +1 for the trailing NULL
354 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
355 return buf;
356 }
357 }
358
359 return wxWCharBuffer();
360 }
361
362 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
363 {
364 if ( pwz )
365 {
366 const size_t nLen = WC2MB(NULL, pwz, 0);
367 if ( nLen != (size_t)wxCONV_FAILED )
368 {
369 // extra space for trailing NUL(s)
370 static const size_t extraLen = GetMaxMBNulLen();
371
372 wxCharBuffer buf(nLen + extraLen - 1);
373 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
374 return buf;
375 }
376 }
377
378 return wxCharBuffer();
379 }
380
381 const wxWCharBuffer
382 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
383 {
384 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
385 if ( dstLen != (size_t)wxCONV_FAILED )
386 {
387 wxWCharBuffer wbuf(dstLen - 1);
388 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) )
389 {
390 if ( outLen )
391 *outLen = dstLen - 1;
392 return wbuf;
393 }
394 }
395
396 if ( outLen )
397 *outLen = 0;
398
399 return wxWCharBuffer();
400 }
401
402 const wxCharBuffer
403 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
404 {
405 const size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
406 if ( dstLen != (size_t)wxCONV_FAILED )
407 {
408 wxCharBuffer buf(dstLen - 1);
409 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) )
410 {
411 if ( outLen )
412 *outLen = dstLen - 1;
413
414 return buf;
415 }
416 }
417
418 if ( outLen )
419 *outLen = 0;
420
421 return wxCharBuffer();
422 }
423
424 // ----------------------------------------------------------------------------
425 // wxMBConvLibc
426 // ----------------------------------------------------------------------------
427
428 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
429 {
430 return wxMB2WC(buf, psz, n);
431 }
432
433 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
434 {
435 return wxWC2MB(buf, psz, n);
436 }
437
438 // ----------------------------------------------------------------------------
439 // wxConvBrokenFileNames
440 // ----------------------------------------------------------------------------
441
442 #ifdef __UNIX__
443
444 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
445 {
446 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
447 || wxStricmp(charset, _T("UTF8")) == 0 )
448 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
449 else
450 m_conv = new wxCSConv(charset);
451 }
452
453 #endif // __UNIX__
454
455 // ----------------------------------------------------------------------------
456 // UTF-7
457 // ----------------------------------------------------------------------------
458
459 // Implementation (C) 2004 Fredrik Roubert
460
461 //
462 // BASE64 decoding table
463 //
// Indexed by byte value: gives the 6 bit BASE64 value of the characters
// 'A'-'Z' (0x00-0x19), 'a'-'z' (0x1a-0x33), '0'-'9' (0x34-0x3d), '+' (0x3e)
// and '/' (0x3f), and 0xff for all the other bytes, which wxMBConvUTF7::MB2WC()
// takes as the end of a BASE64 run.
464 static const unsigned char utf7unb64[] =
465 {
466 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
467 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
468 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
469 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
470 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
// '+' and '/'
471 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
// '0'-'9'
472 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
473 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
// 'A'-'Z'
474 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
475 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
476 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
477 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
// 'a'-'z'
478 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
479 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
480 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
481 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
// bytes 0x80-0xff are never valid
482 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
483 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
484 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
485 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
486 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
487 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
488 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
489 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
490 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
491 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
492 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
498 };
499
// Decode the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL the result is stored in it, n being its size in wide
// characters; otherwise only the length is computed. Returns the number of
// wide characters produced, not counting the trailing NUL, or (size_t)-1 if a
// '+' is followed neither by '-' nor by enough BASE64 data to decode a byte.
500 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
501 {
502 size_t len = 0;
503
504 while ( *psz && (!buf || (len < n)) )
505 {
506 unsigned char cc = *psz++;
507 if (cc != '+')
508 {
509 // plain ASCII char
510 if (buf)
511 *buf++ = cc;
512 len++;
513 }
514 else if (*psz == '-')
515 {
516 // encoded plus sign
517 if (buf)
518 *buf++ = cc;
519 len++;
520 psz++;
521 }
522 else // start of BASE64 encoded string
523 {
// d accumulates the decoded bits, 6 per input character, and l is the
// number of bits in it not consumed yet; lsb tells whether the next
// byte extracted is the low or the high half of a 16 bit unit and ok
// becomes true once at least one byte has been extracted
524 bool lsb, ok;
525 unsigned int d, l;
526 for ( ok = lsb = false, d = 0, l = 0;
527 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
528 psz++ )
529 {
530 d <<= 6;
531 d += cc;
// NOTE(review): len is not checked against n inside this loop, so a long
// BASE64 run looks able to write past the end of buf -- confirm
532 for (l += 6; l >= 8; lsb = !lsb)
533 {
534 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
535 if (lsb)
536 {
// low byte: completes the unit started by the high byte
537 if (buf)
538 *buf++ |= c;
539 len ++;
540 }
541 else
542 {
// high byte: starts a new unit
543 if (buf)
544 *buf = (wchar_t)(c << 8);
545 }
546
547 ok = true;
548 }
549 }
550
551 if ( !ok )
552 {
553 // in valid UTF7 we should have valid characters after '+'
554 return (size_t)-1;
555 }
556
// skip the optional '-' terminating the BASE64 run
557 if (*psz == '-')
558 psz++;
559 }
560 }
561
// NUL-terminate the output if there is still room for it
562 if ( buf && (len < n) )
563 *buf = '\0';
564
565 return len;
566 }
567
568 //
569 // BASE64 encoding table
570 //
// Maps a 6 bit value (0-63) to the BASE64 character encoding it.
571 static const unsigned char utf7enb64[] =
572 {
573 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
574 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
575 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
576 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
577 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
578 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
579 'w', 'x', 'y', 'z', '0', '1', '2', '3',
580 '4', '5', '6', '7', '8', '9', '+', '/'
581 };
582
583 //
584 // UTF-7 encoding table
585 //
586 // 0 - Set D (directly encoded characters)
587 // 1 - Set O (optional direct characters)
588 // 2 - whitespace characters (optional)
589 // 3 - special characters
590 //
// Indexed by ASCII code (0-127), 16 entries per row. wxMBConvUTF7::WC2MB()
// only copies the characters whose class is 0 directly, all the others are
// BASE64 encoded.
591 static const unsigned char utf7encode[128] =
592 {
593 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
594 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
595 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
596 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
597 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
598 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
599 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
600 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
601 };
602
// Encode the NUL-terminated wide string psz as UTF-7.
//
// If buf is not NULL the result is stored in it, n being its size in bytes;
// otherwise only the length is computed. Returns the number of bytes
// produced, not counting the trailing NUL, or (size_t)-1 if wchar_t is wider
// than 16 bits and a character above 0xffff starts an encoded run.
603 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
604 {
605 size_t len = 0;
606
607 while (*psz && ((!buf) || (len < n)))
608 {
609 wchar_t cc = *psz++;
610 if (cc < 0x80 && utf7encode[cc] < 1)
611 {
612 // plain ASCII char
613 if (buf)
614 *buf++ = (char)cc;
615
616 len++;
617 }
618 #ifndef WC_UTF16
619 else if (((wxUint32)cc) > 0xffff)
620 {
621 // no surrogate pair generation (yet?)
622 return (size_t)-1;
623 }
624 #endif
625 else
626 {
// start an encoded run: '+' itself is written as "+-", anything else as
// '+', the BASE64 data and a closing '-'
627 if (buf)
628 *buf++ = '+';
629 len++;
630 if (cc != '+')
631 {
632 // BASE64 encode string
// d accumulates the bits of the characters, high byte first, and l is
// the number of bits in it not output yet
// NOTE(review): neither len nor the 0xffff limit is checked for the
// characters consumed inside this loop -- confirm
633 unsigned int lsb, d, l;
634 for (d = 0, l = 0; /*nothing*/; psz++)
635 {
636 for (lsb = 0; lsb < 2; lsb ++)
637 {
638 d <<= 8;
639 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
640
641 for (l += 8; l >= 6; )
642 {
643 l -= 6;
644 if (buf)
645 *buf++ = utf7enb64[(d >> l) % 64];
646 len++;
647 }
648 }
// the run ends at the end of the string or at the next character which
// can be copied directly
649 cc = *psz;
650 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
651 break;
652 }
// flush the remaining bits, padded with zeroes to a full 6 bit group
653 if (l != 0)
654 {
655 if (buf)
656 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
657 len++;
658 }
659 }
660
661 if (buf)
662 *buf++ = '-';
663 len++;
664 }
665 }
666
// NUL-terminate the output if there is still room for it
667 if (buf && (len < n))
668 *buf = 0;
669
670 return len;
671 }
672
673 // ----------------------------------------------------------------------------
674 // UTF-8
675 // ----------------------------------------------------------------------------
676
// utf8_max[i] is the greatest value which can be encoded using i + 1 bytes in
// UTF-8 for i from 0 to 5; the last element is the maximal wxUint32 value and
// ensures that the loop searching this table in wxMBConvUTF8::WC2MB() always
// terminates.
677 static wxUint32 utf8_max[]=
678 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
679
680 // boundaries of the private use area we use to (temporarily) remap the
681 // bytes which are invalid in a UTF-8 encoded string
682 const wxUint32 wxUnicodePUA = 0x100000;
683 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
684
// Decode the NUL-terminated UTF-8 string psz.
//
// If buf is not NULL the result is stored in it, n being its size in wide
// characters; otherwise only the length is computed. Invalid input is handled
// according to m_options: each offending byte is mapped to wxUnicodePUA + byte
// (MAP_INVALID_UTF8_TO_PUA), written as a backslash followed by 3 octal digits
// (MAP_INVALID_UTF8_TO_OCTAL) or makes the function fail.
//
// Returns the number of wide characters produced, not counting the trailing
// NUL, or (size_t)-1 on failure.
685 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
686 {
687 size_t len = 0;
688
689 while (*psz && ((!buf) || (len < n)))
690 {
// remember where this sequence starts to be able to re-read its bytes if
// it turns out to be invalid
691 const char *opsz = psz;
692 bool invalid = false;
693 unsigned char cc = *psz++, fc = cc;
// count the leading 1 bits of the first byte
694 unsigned cnt;
695 for (cnt = 0; fc & 0x80; cnt++)
696 fc <<= 1;
697
698 if (!cnt)
699 {
700 // plain ASCII char
701 if (buf)
702 *buf++ = cc;
703 len++;
704
705 // escape the escape character for octal escapes
706 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
707 && cc == '\\' && (!buf || len < n))
708 {
709 if (buf)
710 *buf++ = cc;
711 len++;
712 }
713 }
714 else
715 {
// cnt is now the number of continuation bytes expected
716 cnt--;
717 if (!cnt)
718 {
719 // invalid UTF-8 sequence
// (a continuation byte where a lead byte was expected)
720 invalid = true;
721 }
722 else
723 {
// ocnt indexes the greatest value encodable with one byte less
724 unsigned ocnt = cnt - 1;
725 wxUint32 res = cc & (0x3f >> cnt);
726 while (cnt--)
727 {
728 cc = *psz;
729 if ((cc & 0xC0) != 0x80)
730 {
731 // invalid UTF-8 sequence
732 invalid = true;
733 break;
734 }
735
736 psz++;
737 res = (res << 6) | (cc & 0x3f);
738 }
739 if (invalid || res <= utf8_max[ocnt])
740 {
741 // illegal UTF-8 encoding
// (truncated sequence, or a value which a shorter sequence could encode)
742 invalid = true;
743 }
744 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
745 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
746 {
747 // if one of our PUA characters turns up externally
748 // it must also be treated as an illegal sequence
749 // (a bit like you have to escape an escape character)
750 invalid = true;
751 }
752 else
753 {
754 #ifdef WC_UTF16
755 // cast is ok because wchar_t == wxUint16 if WC_UTF16
756 size_t pa = encode_utf16(res, (wxUint16 *)buf);
757 if (pa == (size_t)-1)
758 {
759 invalid = true;
760 }
761 else
762 {
763 if (buf)
764 buf += pa;
765 len += pa;
766 }
767 #else // !WC_UTF16
768 if (buf)
769 *buf++ = (wchar_t)res;
770 len++;
771 #endif // WC_UTF16/!WC_UTF16
772 }
773 }
774 if (invalid)
775 {
776 if (m_options & MAP_INVALID_UTF8_TO_PUA)
777 {
// map every byte consumed so far to the character wxUnicodePUA + byte
778 while (opsz < psz && (!buf || len < n))
779 {
780 #ifdef WC_UTF16
781 // cast is ok because wchar_t == wxUint16 if WC_UTF16
782 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
783 wxASSERT(pa != (size_t)-1);
784 if (buf)
785 buf += pa;
786 opsz++;
787 len += pa;
788 #else
789 if (buf)
790 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
791 opsz++;
792 len++;
793 #endif
794 }
795 }
796 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
797 {
// write every byte consumed so far as a backslash and 3 octal digits
798 while (opsz < psz && (!buf || len < n))
799 {
// NOTE(review): if the 4 characters don't fit nothing is written but len
// is still increased by 4 below -- confirm this is intended
800 if ( buf && len + 3 < n )
801 {
802 unsigned char on = *opsz;
803 *buf++ = L'\\';
804 *buf++ = (wchar_t)( L'0' + on / 0100 );
805 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
806 *buf++ = (wchar_t)( L'0' + on % 010 );
807 }
808
809 opsz++;
810 len += 4;
811 }
812 }
813 else // MAP_INVALID_UTF8_NOT
814 {
815 return (size_t)-1;
816 }
817 }
818 }
819 }
820
// NUL-terminate the output if there is still room for it
821 if (buf && (len < n))
822 *buf = 0;
823
824 return len;
825 }
826
// Returns true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
831
// Encode the NUL-terminated wide string psz as UTF-8.
//
// If buf is not NULL the result is stored in it, n being its size in bytes;
// otherwise only the length is computed. Depending on m_options, the
// characters in the wxUnicodePUA range or the octal escapes produced by
// MB2WC() for the invalid bytes are converted back to the original bytes.
//
// Returns the number of bytes produced, not counting the trailing NUL.
832 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
833 {
834 size_t len = 0;
835
836 while (*psz && ((!buf) || (len < n)))
837 {
838 wxUint32 cc;
839
840 #ifdef WC_UTF16
841 // cast is ok for WC_UTF16
// an unpaired surrogate is not an error here: it is passed through as is
// and only one unit is skipped
842 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
843 psz += (pa == (size_t)-1) ? 1 : pa;
844 #else
845 cc = (*psz++) & 0x7fffffff;
846 #endif
847
848 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
849 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
850 {
// restore the byte which had been mapped to this character
851 if (buf)
852 *buf++ = (char)(cc - wxUnicodePUA);
853 len++;
854 }
855 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
856 && cc == L'\\' && psz[0] == L'\\' )
857 {
// an escaped backslash: output a single one
858 if (buf)
859 *buf++ = (char)cc;
860 psz++;
861 len++;
862 }
863 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
864 cc == L'\\' &&
865 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
866 {
// an octal escape: output the byte it stands for
867 if (buf)
868 {
869 *buf++ = (char) ((psz[0] - L'0') * 0100 +
870 (psz[1] - L'0') * 010 +
871 (psz[2] - L'0'));
872 }
873
874 psz += 3;
875 len++;
876 }
877 else
878 {
// find the number of continuation bytes needed for this character
879 unsigned cnt;
880 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
881 {
882 }
883
884 if (!cnt)
885 {
886 // plain ASCII char
887 if (buf)
888 *buf++ = (char) cc;
889 len++;
890 }
891
892 else
893 {
// NOTE(review): the cnt + 1 bytes are written without checking that they
// all fit in the n bytes of buf -- confirm
894 len += cnt + 1;
895 if (buf)
896 {
// lead byte: cnt + 1 leading 1 bits followed by the highest bits of cc
897 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
898 while (cnt--)
899 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
900 }
901 }
902 }
903 }
904
// NUL-terminate the output if there is still room for it
905 if (buf && (len < n))
906 *buf = 0;
907
908 return len;
909 }
910
911 // ----------------------------------------------------------------------------
912 // UTF-16
913 // ----------------------------------------------------------------------------
914
915 #ifdef WORDS_BIGENDIAN
916 #define wxMBConvUTF16straight wxMBConvUTF16BE
917 #define wxMBConvUTF16swap wxMBConvUTF16LE
918 #else
919 #define wxMBConvUTF16swap wxMBConvUTF16BE
920 #define wxMBConvUTF16straight wxMBConvUTF16LE
921 #endif
922
923
924 #ifdef WC_UTF16
925
926 // copy 16bit MB to 16bit String
927 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
928 {
929 size_t len = 0;
930
931 while (*(wxUint16*)psz && (!buf || len < n))
932 {
933 if (buf)
934 *buf++ = *(wxUint16*)psz;
935 len++;
936
937 psz += sizeof(wxUint16);
938 }
939
940 if (buf && len < n)
941 *buf = 0;
942
943 return len;
944 }
945
946
947 // copy 16bit String to 16bit MB
948 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
949 {
950 size_t len = 0;
951
952 while (*psz && (!buf || len < n))
953 {
954 if (buf)
955 {
956 *(wxUint16*)buf = *psz;
957 buf += sizeof(wxUint16);
958 }
959
960 len += sizeof(wxUint16);
961 psz++;
962 }
963
964 if (buf && len <= n - sizeof(wxUint16))
965 *(wxUint16*)buf = 0;
966
967 return len;
968 }
969
970
971 // swap 16bit MB to 16bit String
972 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
973 {
974 size_t len = 0;
975
976 // UTF16 string must be terminated by 2 NULs as single NULs may occur
977 // inside the string
978 while ( (psz[0] || psz[1]) && (!buf || len < n) )
979 {
980 if ( buf )
981 {
982 ((char *)buf)[0] = psz[1];
983 ((char *)buf)[1] = psz[0];
984 buf++;
985 }
986 len++;
987 psz += 2;
988 }
989
990 if ( buf && len < n )
991 *buf = L'\0';
992
993 return len;
994 }
995
996
997 // swap 16bit MB to 16bit String
998 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
999 {
1000 size_t len = 0;
1001
1002 while ( *psz && (!buf || len < n) )
1003 {
1004 if ( buf )
1005 {
1006 *buf++ = ((char*)psz)[1];
1007 *buf++ = ((char*)psz)[0];
1008 }
1009
1010 len += 2;
1011 psz++;
1012 }
1013
1014 if ( buf && len < n - 1 )
1015 {
1016 buf[0] =
1017 buf[1] = '\0';
1018 }
1019
1020 return len;
1021 }
1022
1023
1024 #else // WC_UTF16
1025
1026
1027 // copy 16bit MB to 32bit String
1028 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1029 {
1030 size_t len = 0;
1031
1032 while (*(wxUint16*)psz && (!buf || len < n))
1033 {
1034 wxUint32 cc;
1035 size_t pa = decode_utf16((wxUint16*)psz, cc);
1036 if (pa == (size_t)-1)
1037 return pa;
1038
1039 if (buf)
1040 *buf++ = (wchar_t)cc;
1041 len++;
1042 psz += pa * sizeof(wxUint16);
1043 }
1044
1045 if (buf && len < n)
1046 *buf = 0;
1047
1048 return len;
1049 }
1050
1051
1052 // copy 32bit String to 16bit MB
1053 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1054 {
1055 size_t len=0;
1056
1057 while (*psz && (!buf || len < n))
1058 {
1059 wxUint16 cc[2];
1060 size_t pa = encode_utf16(*psz, cc);
1061
1062 if (pa == (size_t)-1)
1063 return pa;
1064
1065 if (buf)
1066 {
1067 *(wxUint16*)buf = cc[0];
1068 buf += sizeof(wxUint16);
1069 if (pa > 1)
1070 {
1071 *(wxUint16*)buf = cc[1];
1072 buf += sizeof(wxUint16);
1073 }
1074 }
1075
1076 len += pa*sizeof(wxUint16);
1077 psz++;
1078 }
1079
1080 if (buf && len <= n - sizeof(wxUint16))
1081 *(wxUint16*)buf = 0;
1082
1083 return len;
1084 }
1085
1086
1087 // swap 16bit MB to 32bit String
1088 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1089 {
1090 size_t len=0;
1091
1092 while (*(wxUint16*)psz && (!buf || len < n))
1093 {
1094 wxUint32 cc;
1095 char tmp[4];
1096
1097 tmp[0] = psz[1];
1098 tmp[1] = psz[0];
1099 tmp[2] = psz[3];
1100 tmp[3] = psz[2];
1101
1102 size_t pa = decode_utf16((wxUint16*)tmp, cc);
1103 if (pa == (size_t)-1)
1104 return pa;
1105
1106 if (buf)
1107 *buf++ = (wchar_t)cc;
1108
1109 len++;
1110 psz += pa * sizeof(wxUint16);
1111 }
1112
1113 if (buf && len < n)
1114 *buf = 0;
1115
1116 return len;
1117 }
1118
1119
1120 // swap 32bit String to 16bit MB
1121 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1122 {
1123 size_t len = 0;
1124
1125 while (*psz && (!buf || len < n))
1126 {
1127 wxUint16 cc[2];
1128 size_t pa = encode_utf16(*psz, cc);
1129
1130 if (pa == (size_t)-1)
1131 return pa;
1132
1133 if (buf)
1134 {
1135 *buf++ = ((char*)cc)[1];
1136 *buf++ = ((char*)cc)[0];
1137 if (pa > 1)
1138 {
1139 *buf++ = ((char*)cc)[3];
1140 *buf++ = ((char*)cc)[2];
1141 }
1142 }
1143
1144 len += pa * sizeof(wxUint16);
1145 psz++;
1146 }
1147
1148 if (buf && len <= n - sizeof(wxUint16))
1149 *(wxUint16*)buf = 0;
1150
1151 return len;
1152 }
1153
1154 #endif // WC_UTF16
1155
1156
1157 // ----------------------------------------------------------------------------
1158 // UTF-32
1159 // ----------------------------------------------------------------------------
1160
1161 #ifdef WORDS_BIGENDIAN
1162 #define wxMBConvUTF32straight wxMBConvUTF32BE
1163 #define wxMBConvUTF32swap wxMBConvUTF32LE
1164 #else
1165 #define wxMBConvUTF32swap wxMBConvUTF32BE
1166 #define wxMBConvUTF32straight wxMBConvUTF32LE
1167 #endif
1168
1169
// Global instances of the little and big endian UTF-32 converters.
1170 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1171 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1172
1173
1174 #ifdef WC_UTF16
1175
1176 // copy 32bit MB to 16bit String
1177 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1178 {
1179 size_t len = 0;
1180
1181 while (*(wxUint32*)psz && (!buf || len < n))
1182 {
1183 wxUint16 cc[2];
1184
1185 size_t pa = encode_utf16(*(wxUint32*)psz, cc);
1186 if (pa == (size_t)-1)
1187 return pa;
1188
1189 if (buf)
1190 {
1191 *buf++ = cc[0];
1192 if (pa > 1)
1193 *buf++ = cc[1];
1194 }
1195
1196 len += pa;
1197 psz += sizeof(wxUint32);
1198 }
1199
1200 if (buf && len < n)
1201 *buf = 0;
1202
1203 return len;
1204 }
1205
1206
1207 // copy 16bit String to 32bit MB
1208 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1209 {
1210 size_t len = 0;
1211
1212 while (*psz && (!buf || len < n))
1213 {
1214 wxUint32 cc;
1215
1216 // cast is ok for WC_UTF16
1217 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1218 if (pa == (size_t)-1)
1219 return pa;
1220
1221 if (buf)
1222 {
1223 *(wxUint32*)buf = cc;
1224 buf += sizeof(wxUint32);
1225 }
1226
1227 len += sizeof(wxUint32);
1228 psz += pa;
1229 }
1230
1231 if (buf && len <= n - sizeof(wxUint32))
1232 *(wxUint32*)buf = 0;
1233
1234 return len;
1235 }
1236
1237
1238 // swap 32bit MB to 16bit String
1239 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1240 {
1241 size_t len = 0;
1242
1243 while (*(wxUint32*)psz && (!buf || len < n))
1244 {
1245 char tmp[4];
1246 tmp[0] = psz[3];
1247 tmp[1] = psz[2];
1248 tmp[2] = psz[1];
1249 tmp[3] = psz[0];
1250
1251 wxUint16 cc[2];
1252
1253 size_t pa = encode_utf16(*(wxUint32*)tmp, cc);
1254 if (pa == (size_t)-1)
1255 return pa;
1256
1257 if (buf)
1258 {
1259 *buf++ = cc[0];
1260 if (pa > 1)
1261 *buf++ = cc[1];
1262 }
1263
1264 len += pa;
1265 psz += sizeof(wxUint32);
1266 }
1267
1268 if (buf && len < n)
1269 *buf = 0;
1270
1271 return len;
1272 }
1273
1274
1275 // swap 16bit String to 32bit MB
1276 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1277 {
1278 size_t len = 0;
1279
1280 while (*psz && (!buf || len < n))
1281 {
1282 char cc[4];
1283
1284 // cast is ok for WC_UTF16
1285 size_t pa = decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1286 if (pa == (size_t)-1)
1287 return pa;
1288
1289 if (buf)
1290 {
1291 *buf++ = cc[3];
1292 *buf++ = cc[2];
1293 *buf++ = cc[1];
1294 *buf++ = cc[0];
1295 }
1296
1297 len += sizeof(wxUint32);
1298 psz += pa;
1299 }
1300
1301 if (buf && len <= n - sizeof(wxUint32))
1302 *(wxUint32*)buf = 0;
1303
1304 return len;
1305 }
1306
1307 #else // WC_UTF16
1308
1309
1310 // copy 32bit MB to 32bit String
1311 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1312 {
1313 size_t len=0;
1314
1315 while (*(wxUint32*)psz && (!buf || len < n))
1316 {
1317 if (buf)
1318 *buf++ = (wchar_t)(*(wxUint32*)psz);
1319 len++;
1320 psz += sizeof(wxUint32);
1321 }
1322
1323 if (buf && len < n)
1324 *buf = 0;
1325
1326 return len;
1327 }
1328
1329
1330 // copy 32bit String to 32bit MB
1331 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1332 {
1333 size_t len = 0;
1334
1335 while (*psz && (!buf || len < n))
1336 {
1337 if (buf)
1338 {
1339 *(wxUint32*)buf = *psz;
1340 buf += sizeof(wxUint32);
1341 }
1342
1343 len += sizeof(wxUint32);
1344 psz++;
1345 }
1346
1347 if (buf && len <= n - sizeof(wxUint32))
1348 *(wxUint32*)buf = 0;
1349
1350 return len;
1351 }
1352
1353
1354 // swap 32bit MB to 32bit String
1355 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1356 {
1357 size_t len = 0;
1358
1359 while (*(wxUint32*)psz && (!buf || len < n))
1360 {
1361 if (buf)
1362 {
1363 ((char *)buf)[0] = psz[3];
1364 ((char *)buf)[1] = psz[2];
1365 ((char *)buf)[2] = psz[1];
1366 ((char *)buf)[3] = psz[0];
1367 buf++;
1368 }
1369
1370 len++;
1371 psz += sizeof(wxUint32);
1372 }
1373
1374 if (buf && len < n)
1375 *buf = 0;
1376
1377 return len;
1378 }
1379
1380
1381 // swap 32bit String to 32bit MB
1382 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1383 {
1384 size_t len = 0;
1385
1386 while (*psz && (!buf || len < n))
1387 {
1388 if (buf)
1389 {
1390 *buf++ = ((char *)psz)[3];
1391 *buf++ = ((char *)psz)[2];
1392 *buf++ = ((char *)psz)[1];
1393 *buf++ = ((char *)psz)[0];
1394 }
1395
1396 len += sizeof(wxUint32);
1397 psz++;
1398 }
1399
1400 if (buf && len <= n - sizeof(wxUint32))
1401 *(wxUint32*)buf = 0;
1402
1403 return len;
1404 }
1405
1406
1407 #endif // WC_UTF16
1408
1409
1410 // ============================================================================
1411 // The classes doing conversion using the iconv_xxx() functions
1412 // ============================================================================
1413
1414 #ifdef HAVE_ICONV
1415
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft): cres is the value returned by iconv() and
// bufLeft the number of input bytes it left unconverted.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the address of the input pointer to the type used for the second
// iconv() argument (ICONV_CONST is defined outside of this file section)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value of an iconv_t handle which couldn't be opened
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP swaps the bytes of a single wchar_t and WC_ENC is the encoding
// whose names are tried as the iconv charset corresponding to wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1445
1446 // ----------------------------------------------------------------------------
1447 // wxMBConv_iconv: encapsulates an iconv character set
1448 // ----------------------------------------------------------------------------
1449
1450 class wxMBConv_iconv : public wxMBConv
1451 {
1452 public:
1453 wxMBConv_iconv(const wxChar *name);
1454 virtual ~wxMBConv_iconv();
1455
1456 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1457 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1458
1459 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1460 virtual size_t GetMBNulLen() const;
1461
1462 virtual wxMBConv *Clone() const
1463 {
1464 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1465 p->m_minMBCharWidth = m_minMBCharWidth;
1466 return p;
1467 }
1468
1469 bool IsOk() const
1470 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1471
1472 protected:
1473 // the iconv handlers used to translate from multibyte to wide char and in
1474 // the other direction
1475 iconv_t m2w,
1476 w2m;
1477
1478 #if wxUSE_THREADS
1479 // guards access to m2w and w2m objects
1480 wxMutex m_iconvMutex;
1481 #endif
1482
1483 private:
1484 // the name (for iconv_open()) of a wide char charset -- if none is
1485 // available on this machine, it will remain NULL
1486 static wxString ms_wcCharsetName;
1487
1488 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1489 // different endian-ness than the native one
1490 static bool ms_wcNeedsSwap;
1491
1492
1493 // name of the encoding handled by this conversion
1494 wxString m_name;
1495
1496 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1497 // initially
1498 size_t m_minMBCharWidth;
1499 };
1500
1501 // make the constructor available for unit testing
1502 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1503 {
1504 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1505 if ( !result->IsOk() )
1506 {
1507 delete result;
1508 return 0;
1509 }
1510
1511 return result;
1512 }
1513
// definitions of the static members: both are filled in by the
// wxMBConv_iconv constructor for as long as ms_wcCharsetName is still empty
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1516
1517 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1518 : m_name(name)
1519 {
1520 m_minMBCharWidth = 0;
1521
1522 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1523 // names for the charsets
1524 const wxCharBuffer cname(wxString(name).ToAscii());
1525
1526 // check for charset that represents wchar_t:
1527 if ( ms_wcCharsetName.empty() )
1528 {
1529 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1530
1531 #if wxUSE_FONTMAP
1532 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1533 #else // !wxUSE_FONTMAP
1534 static const wxChar *names[] =
1535 {
1536 #if SIZEOF_WCHAR_T == 4
1537 _T("UCS-4"),
1538 #elif SIZEOF_WCHAR_T = 2
1539 _T("UCS-2"),
1540 #endif
1541 NULL
1542 };
1543 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1544
1545 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1546 {
1547 const wxString nameCS(*names);
1548
1549 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1550 wxString nameXE(nameCS);
1551 #ifdef WORDS_BIGENDIAN
1552 nameXE += _T("BE");
1553 #else // little endian
1554 nameXE += _T("LE");
1555 #endif
1556
1557 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1558 nameXE.c_str());
1559
1560 m2w = iconv_open(nameXE.ToAscii(), cname);
1561 if ( m2w == ICONV_T_INVALID )
1562 {
1563 // try charset w/o bytesex info (e.g. "UCS4")
1564 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1565 nameCS.c_str());
1566 m2w = iconv_open(nameCS.ToAscii(), cname);
1567
1568 // and check for bytesex ourselves:
1569 if ( m2w != ICONV_T_INVALID )
1570 {
1571 char buf[2], *bufPtr;
1572 wchar_t wbuf[2], *wbufPtr;
1573 size_t insz, outsz;
1574 size_t res;
1575
1576 buf[0] = 'A';
1577 buf[1] = 0;
1578 wbuf[0] = 0;
1579 insz = 2;
1580 outsz = SIZEOF_WCHAR_T * 2;
1581 wbufPtr = wbuf;
1582 bufPtr = buf;
1583
1584 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1585 (char**)&wbufPtr, &outsz);
1586
1587 if (ICONV_FAILED(res, insz))
1588 {
1589 wxLogLastError(wxT("iconv"));
1590 wxLogError(_("Conversion to charset '%s' doesn't work."),
1591 nameCS.c_str());
1592 }
1593 else // ok, can convert to this encoding, remember it
1594 {
1595 ms_wcCharsetName = nameCS;
1596 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1597 }
1598 }
1599 }
1600 else // use charset not requiring byte swapping
1601 {
1602 ms_wcCharsetName = nameXE;
1603 }
1604 }
1605
1606 wxLogTrace(TRACE_STRCONV,
1607 wxT("iconv wchar_t charset is \"%s\"%s"),
1608 ms_wcCharsetName.empty() ? _T("<none>")
1609 : ms_wcCharsetName.c_str(),
1610 ms_wcNeedsSwap ? _T(" (needs swap)")
1611 : _T(""));
1612 }
1613 else // we already have ms_wcCharsetName
1614 {
1615 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1616 }
1617
1618 if ( ms_wcCharsetName.empty() )
1619 {
1620 w2m = ICONV_T_INVALID;
1621 }
1622 else
1623 {
1624 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1625 if ( w2m == ICONV_T_INVALID )
1626 {
1627 wxLogTrace(TRACE_STRCONV,
1628 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1629 ms_wcCharsetName.c_str(), cname.data());
1630 }
1631 }
1632 }
1633
1634 wxMBConv_iconv::~wxMBConv_iconv()
1635 {
1636 if ( m2w != ICONV_T_INVALID )
1637 iconv_close(m2w);
1638 if ( w2m != ICONV_T_INVALID )
1639 iconv_close(w2m);
1640 }
1641
1642 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1643 {
1644 // find the string length: notice that must be done differently for
1645 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1646 size_t inbuf;
1647 const size_t nulLen = GetMBNulLen();
1648 switch ( nulLen )
1649 {
1650 default:
1651 return (size_t)-1;
1652
1653 case 1:
1654 inbuf = strlen(psz); // arguably more optimized than our version
1655 break;
1656
1657 case 2:
1658 case 4:
1659 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1660 // they also have to start at character boundary and not span two
1661 // adjacent characters
1662 const char *p;
1663 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1664 ;
1665 inbuf = p - psz;
1666 break;
1667 }
1668
1669 #if wxUSE_THREADS
1670 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1671 // Unfortunately there is a couple of global wxCSConv objects such as
1672 // wxConvLocal that are used all over wx code, so we have to make sure
1673 // the handle is used by at most one thread at the time. Otherwise
1674 // only a few wx classes would be safe to use from non-main threads
1675 // as MB<->WC conversion would fail "randomly".
1676 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1677 #endif // wxUSE_THREADS
1678
1679 size_t outbuf = n * SIZEOF_WCHAR_T;
1680 size_t res, cres;
1681 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1682 wchar_t *bufPtr = buf;
1683 const char *pszPtr = psz;
1684
1685 if (buf)
1686 {
1687 // have destination buffer, convert there
1688 cres = iconv(m2w,
1689 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1690 (char**)&bufPtr, &outbuf);
1691 res = n - (outbuf / SIZEOF_WCHAR_T);
1692
1693 if (ms_wcNeedsSwap)
1694 {
1695 // convert to native endianness
1696 for ( unsigned i = 0; i < res; i++ )
1697 buf[n] = WC_BSWAP(buf[i]);
1698 }
1699
1700 // NUL-terminate the string if there is any space left
1701 if (res < n)
1702 buf[res] = 0;
1703 }
1704 else
1705 {
1706 // no destination buffer... convert using temp buffer
1707 // to calculate destination buffer requirement
1708 wchar_t tbuf[8];
1709 res = 0;
1710
1711 do
1712 {
1713 bufPtr = tbuf;
1714 outbuf = 8 * SIZEOF_WCHAR_T;
1715
1716 cres = iconv(m2w,
1717 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1718 (char**)&bufPtr, &outbuf );
1719
1720 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1721 }
1722 while ((cres == (size_t)-1) && (errno == E2BIG));
1723 }
1724
1725 if (ICONV_FAILED(cres, inbuf))
1726 {
1727 //VS: it is ok if iconv fails, hence trace only
1728 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1729 return (size_t)-1;
1730 }
1731
1732 return res;
1733 }
1734
1735 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1736 {
1737 #if wxUSE_THREADS
1738 // NB: explained in MB2WC
1739 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1740 #endif
1741
1742 size_t inlen = wxWcslen(psz);
1743 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1744 size_t outbuf = n;
1745 size_t res, cres;
1746
1747 wchar_t *tmpbuf = 0;
1748
1749 if (ms_wcNeedsSwap)
1750 {
1751 // need to copy to temp buffer to switch endianness
1752 // (doing WC_BSWAP twice on the original buffer won't help, as it
1753 // could be in read-only memory, or be accessed in some other thread)
1754 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1755 for ( size_t i = 0; i < inlen; i++ )
1756 tmpbuf[n] = WC_BSWAP(psz[i]);
1757
1758 tmpbuf[inlen] = L'\0';
1759 psz = tmpbuf;
1760 }
1761
1762 if (buf)
1763 {
1764 // have destination buffer, convert there
1765 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1766
1767 res = n - outbuf;
1768
1769 // NB: iconv was given only wcslen(psz) characters on input, and so
1770 // it couldn't convert the trailing zero. Let's do it ourselves
1771 // if there's some room left for it in the output buffer.
1772 if (res < n)
1773 buf[0] = 0;
1774 }
1775 else
1776 {
1777 // no destination buffer... convert using temp buffer
1778 // to calculate destination buffer requirement
1779 char tbuf[16];
1780 res = 0;
1781 do
1782 {
1783 buf = tbuf;
1784 outbuf = 16;
1785
1786 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1787
1788 res += 16 - outbuf;
1789 }
1790 while ((cres == (size_t)-1) && (errno == E2BIG));
1791 }
1792
1793 if (ms_wcNeedsSwap)
1794 {
1795 free(tmpbuf);
1796 }
1797
1798 if (ICONV_FAILED(cres, inbuf))
1799 {
1800 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1801 return (size_t)-1;
1802 }
1803
1804 return res;
1805 }
1806
1807 size_t wxMBConv_iconv::GetMBNulLen() const
1808 {
1809 if ( m_minMBCharWidth == 0 )
1810 {
1811 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1812
1813 #if wxUSE_THREADS
1814 // NB: explained in MB2WC
1815 wxMutexLocker lock(self->m_iconvMutex);
1816 #endif
1817
1818 wchar_t *wnul = L"";
1819 char buf[8]; // should be enough for NUL in any encoding
1820 size_t inLen = sizeof(wchar_t),
1821 outLen = WXSIZEOF(buf);
1822 char *inBuff = (char *)wnul;
1823 char *outBuff = buf;
1824 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1825 {
1826 self->m_minMBCharWidth = (size_t)-1;
1827 }
1828 else // ok
1829 {
1830 self->m_minMBCharWidth = outBuff - buf;
1831 }
1832 }
1833
1834 return m_minMBCharWidth;
1835 }
1836
1837 #endif // HAVE_ICONV
1838
1839
1840 // ============================================================================
1841 // Win32 conversion classes
1842 // ============================================================================
1843
1844 #ifdef wxHAVE_WIN32_MB2WC
1845
// from utils.cpp
#if wxUSE_FONTMAP
// wxMBConv_win32 uses these to obtain its code page from a charset name or
// from a font encoding
// NOTE(review): wxMBConv_win32::IsOk() treats a code page of -1 as invalid,
// so these presumably return -1 for unknown input -- confirm in utils.cpp
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
#endif
1851
1852 class wxMBConv_win32 : public wxMBConv
1853 {
1854 public:
1855 wxMBConv_win32()
1856 {
1857 m_CodePage = CP_ACP;
1858 m_minMBCharWidth = 0;
1859 }
1860
1861 wxMBConv_win32(const wxMBConv_win32& conv)
1862 {
1863 m_CodePage = conv.m_CodePage;
1864 m_minMBCharWidth = conv.m_minMBCharWidth;
1865 }
1866
1867 #if wxUSE_FONTMAP
1868 wxMBConv_win32(const wxChar* name)
1869 {
1870 m_CodePage = wxCharsetToCodepage(name);
1871 m_minMBCharWidth = 0;
1872 }
1873
1874 wxMBConv_win32(wxFontEncoding encoding)
1875 {
1876 m_CodePage = wxEncodingToCodepage(encoding);
1877 m_minMBCharWidth = 0;
1878 }
1879 #endif // wxUSE_FONTMAP
1880
1881 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1882 {
1883 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1884 // the behaviour is not compatible with the Unix version (using iconv)
1885 // and break the library itself, e.g. wxTextInputStream::NextChar()
1886 // wouldn't work if reading an incomplete MB char didn't result in an
1887 // error
1888 //
1889 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1890 // Win XP or newer and it is not supported for UTF-[78] so we always
1891 // use our own conversions in this case. See
1892 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1893 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1894 if ( m_CodePage == CP_UTF8 )
1895 {
1896 return wxConvUTF8.MB2WC(buf, psz, n);
1897 }
1898
1899 if ( m_CodePage == CP_UTF7 )
1900 {
1901 return wxConvUTF7.MB2WC(buf, psz, n);
1902 }
1903
1904 int flags = 0;
1905 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
1906 IsAtLeastWin2kSP4() )
1907 {
1908 flags = MB_ERR_INVALID_CHARS;
1909 }
1910
1911 const size_t len = ::MultiByteToWideChar
1912 (
1913 m_CodePage, // code page
1914 flags, // flags: fall on error
1915 psz, // input string
1916 -1, // its length (NUL-terminated)
1917 buf, // output string
1918 buf ? n : 0 // size of output buffer
1919 );
1920 if ( !len )
1921 {
1922 // function totally failed
1923 return (size_t)-1;
1924 }
1925
1926 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1927 // check if we succeeded, by doing a double trip:
1928 if ( !flags && buf )
1929 {
1930 const size_t mbLen = strlen(psz);
1931 wxCharBuffer mbBuf(mbLen);
1932 if ( ::WideCharToMultiByte
1933 (
1934 m_CodePage,
1935 0,
1936 buf,
1937 -1,
1938 mbBuf.data(),
1939 mbLen + 1, // size in bytes, not length
1940 NULL,
1941 NULL
1942 ) == 0 ||
1943 strcmp(mbBuf, psz) != 0 )
1944 {
1945 // we didn't obtain the same thing we started from, hence
1946 // the conversion was lossy and we consider that it failed
1947 return (size_t)-1;
1948 }
1949 }
1950
1951 // note that it returns count of written chars for buf != NULL and size
1952 // of the needed buffer for buf == NULL so in either case the length of
1953 // the string (which never includes the terminating NUL) is one less
1954 return len - 1;
1955 }
1956
1957 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1958 {
1959 /*
1960 we have a problem here: by default, WideCharToMultiByte() may
1961 replace characters unrepresentable in the target code page with bad
1962 quality approximations such as turning "1/2" symbol (U+00BD) into
1963 "1" for the code pages which don't have it and we, obviously, want
1964 to avoid this at any price
1965
1966 the trouble is that this function does it _silently_, i.e. it won't
1967 even tell us whether it did or not... Win98/2000 and higher provide
1968 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1969 we have to resort to a round trip, i.e. check that converting back
1970 results in the same string -- this is, of course, expensive but
1971 otherwise we simply can't be sure to not garble the data.
1972 */
1973
1974 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1975 // it doesn't work with CJK encodings (which we test for rather roughly
1976 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1977 // supporting it
1978 BOOL usedDef wxDUMMY_INITIALIZE(false);
1979 BOOL *pUsedDef;
1980 int flags;
1981 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1982 {
1983 // it's our lucky day
1984 flags = WC_NO_BEST_FIT_CHARS;
1985 pUsedDef = &usedDef;
1986 }
1987 else // old system or unsupported encoding
1988 {
1989 flags = 0;
1990 pUsedDef = NULL;
1991 }
1992
1993 const size_t len = ::WideCharToMultiByte
1994 (
1995 m_CodePage, // code page
1996 flags, // either none or no best fit
1997 pwz, // input string
1998 -1, // it is (wide) NUL-terminated
1999 buf, // output buffer
2000 buf ? n : 0, // and its size
2001 NULL, // default "replacement" char
2002 pUsedDef // [out] was it used?
2003 );
2004
2005 if ( !len )
2006 {
2007 // function totally failed
2008 return (size_t)-1;
2009 }
2010
2011 // if we were really converting, check if we succeeded
2012 if ( buf )
2013 {
2014 if ( flags )
2015 {
2016 // check if the conversion failed, i.e. if any replacements
2017 // were done
2018 if ( usedDef )
2019 return (size_t)-1;
2020 }
2021 else // we must resort to double tripping...
2022 {
2023 wxWCharBuffer wcBuf(n);
2024 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2025 wcscmp(wcBuf, pwz) != 0 )
2026 {
2027 // we didn't obtain the same thing we started from, hence
2028 // the conversion was lossy and we consider that it failed
2029 return (size_t)-1;
2030 }
2031 }
2032 }
2033
2034 // see the comment above for the reason of "len - 1"
2035 return len - 1;
2036 }
2037
2038 virtual size_t GetMBNulLen() const
2039 {
2040 if ( m_minMBCharWidth == 0 )
2041 {
2042 int len = ::WideCharToMultiByte
2043 (
2044 m_CodePage, // code page
2045 0, // no flags
2046 L"", // input string
2047 1, // translate just the NUL
2048 NULL, // output buffer
2049 0, // and its size
2050 NULL, // no replacement char
2051 NULL // [out] don't care if it was used
2052 );
2053
2054 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2055 switch ( len )
2056 {
2057 default:
2058 wxLogDebug(_T("Unexpected NUL length %d"), len);
2059 self->m_minMBCharWidth = (size_t)-1;
2060 break;
2061
2062 case 0:
2063 self->m_minMBCharWidth = (size_t)-1;
2064 break;
2065
2066 case 1:
2067 case 2:
2068 case 4:
2069 self->m_minMBCharWidth = len;
2070 break;
2071 }
2072 }
2073
2074 return m_minMBCharWidth;
2075 }
2076
2077 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2078
2079 bool IsOk() const { return m_CodePage != -1; }
2080
2081 private:
2082 static bool CanUseNoBestFit()
2083 {
2084 static int s_isWin98Or2k = -1;
2085
2086 if ( s_isWin98Or2k == -1 )
2087 {
2088 int verMaj, verMin;
2089 switch ( wxGetOsVersion(&verMaj, &verMin) )
2090 {
2091 case wxWIN95:
2092 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2093 break;
2094
2095 case wxWINDOWS_NT:
2096 s_isWin98Or2k = verMaj >= 5;
2097 break;
2098
2099 default:
2100 // unknown, be conservative by default
2101 s_isWin98Or2k = 0;
2102 break;
2103 }
2104
2105 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2106 }
2107
2108 return s_isWin98Or2k == 1;
2109 }
2110
2111 static bool IsAtLeastWin2kSP4()
2112 {
2113 #ifdef __WXWINCE__
2114 return false;
2115 #else
2116 static int s_isAtLeastWin2kSP4 = -1;
2117
2118 if ( s_isAtLeastWin2kSP4 == -1 )
2119 {
2120 OSVERSIONINFOEX ver;
2121
2122 memset(&ver, 0, sizeof(ver));
2123 ver.dwOSVersionInfoSize = sizeof(ver);
2124 GetVersionEx((OSVERSIONINFO*)&ver);
2125
2126 s_isAtLeastWin2kSP4 =
2127 ((ver.dwMajorVersion > 5) || // Vista+
2128 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2129 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2130 ver.wServicePackMajor >= 4)) // 2000 SP4+
2131 ? 1 : 0;
2132 }
2133
2134 return s_isAtLeastWin2kSP4 == 1;
2135 #endif
2136 }
2137
2138
2139 // the code page we're working with
2140 long m_CodePage;
2141
2142 // cached result of GetMBNulLen(), set to 0 initially meaning
2143 // "unknown"
2144 size_t m_minMBCharWidth;
2145 };
2146
2147 #endif // wxHAVE_WIN32_MB2WC
2148
2149 // ============================================================================
2150 // Cocoa conversion classes
2151 // ============================================================================
2152
2153 #if defined(__WXCOCOA__)
2154
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a
// bit - it's just not public (yet).
2158
2159 #include <CoreFoundation/CFString.h>
2160 #include <CoreFoundation/CFStringEncodingExt.h>
2161
2162 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2163 {
2164 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2165
2166 switch (encoding)
2167 {
2168 case wxFONTENCODING_DEFAULT :
2169 enc = CFStringGetSystemEncoding();
2170 break ;
2171
2172 case wxFONTENCODING_ISO8859_1 :
2173 enc = kCFStringEncodingISOLatin1 ;
2174 break ;
2175 case wxFONTENCODING_ISO8859_2 :
2176 enc = kCFStringEncodingISOLatin2;
2177 break ;
2178 case wxFONTENCODING_ISO8859_3 :
2179 enc = kCFStringEncodingISOLatin3 ;
2180 break ;
2181 case wxFONTENCODING_ISO8859_4 :
2182 enc = kCFStringEncodingISOLatin4;
2183 break ;
2184 case wxFONTENCODING_ISO8859_5 :
2185 enc = kCFStringEncodingISOLatinCyrillic;
2186 break ;
2187 case wxFONTENCODING_ISO8859_6 :
2188 enc = kCFStringEncodingISOLatinArabic;
2189 break ;
2190 case wxFONTENCODING_ISO8859_7 :
2191 enc = kCFStringEncodingISOLatinGreek;
2192 break ;
2193 case wxFONTENCODING_ISO8859_8 :
2194 enc = kCFStringEncodingISOLatinHebrew;
2195 break ;
2196 case wxFONTENCODING_ISO8859_9 :
2197 enc = kCFStringEncodingISOLatin5;
2198 break ;
2199 case wxFONTENCODING_ISO8859_10 :
2200 enc = kCFStringEncodingISOLatin6;
2201 break ;
2202 case wxFONTENCODING_ISO8859_11 :
2203 enc = kCFStringEncodingISOLatinThai;
2204 break ;
2205 case wxFONTENCODING_ISO8859_13 :
2206 enc = kCFStringEncodingISOLatin7;
2207 break ;
2208 case wxFONTENCODING_ISO8859_14 :
2209 enc = kCFStringEncodingISOLatin8;
2210 break ;
2211 case wxFONTENCODING_ISO8859_15 :
2212 enc = kCFStringEncodingISOLatin9;
2213 break ;
2214
2215 case wxFONTENCODING_KOI8 :
2216 enc = kCFStringEncodingKOI8_R;
2217 break ;
2218 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2219 enc = kCFStringEncodingDOSRussian;
2220 break ;
2221
2222 // case wxFONTENCODING_BULGARIAN :
2223 // enc = ;
2224 // break ;
2225
2226 case wxFONTENCODING_CP437 :
2227 enc = kCFStringEncodingDOSLatinUS ;
2228 break ;
2229 case wxFONTENCODING_CP850 :
2230 enc = kCFStringEncodingDOSLatin1;
2231 break ;
2232 case wxFONTENCODING_CP852 :
2233 enc = kCFStringEncodingDOSLatin2;
2234 break ;
2235 case wxFONTENCODING_CP855 :
2236 enc = kCFStringEncodingDOSCyrillic;
2237 break ;
2238 case wxFONTENCODING_CP866 :
2239 enc = kCFStringEncodingDOSRussian ;
2240 break ;
2241 case wxFONTENCODING_CP874 :
2242 enc = kCFStringEncodingDOSThai;
2243 break ;
2244 case wxFONTENCODING_CP932 :
2245 enc = kCFStringEncodingDOSJapanese;
2246 break ;
2247 case wxFONTENCODING_CP936 :
2248 enc = kCFStringEncodingDOSChineseSimplif ;
2249 break ;
2250 case wxFONTENCODING_CP949 :
2251 enc = kCFStringEncodingDOSKorean;
2252 break ;
2253 case wxFONTENCODING_CP950 :
2254 enc = kCFStringEncodingDOSChineseTrad;
2255 break ;
2256 case wxFONTENCODING_CP1250 :
2257 enc = kCFStringEncodingWindowsLatin2;
2258 break ;
2259 case wxFONTENCODING_CP1251 :
2260 enc = kCFStringEncodingWindowsCyrillic ;
2261 break ;
2262 case wxFONTENCODING_CP1252 :
2263 enc = kCFStringEncodingWindowsLatin1 ;
2264 break ;
2265 case wxFONTENCODING_CP1253 :
2266 enc = kCFStringEncodingWindowsGreek;
2267 break ;
2268 case wxFONTENCODING_CP1254 :
2269 enc = kCFStringEncodingWindowsLatin5;
2270 break ;
2271 case wxFONTENCODING_CP1255 :
2272 enc = kCFStringEncodingWindowsHebrew ;
2273 break ;
2274 case wxFONTENCODING_CP1256 :
2275 enc = kCFStringEncodingWindowsArabic ;
2276 break ;
2277 case wxFONTENCODING_CP1257 :
2278 enc = kCFStringEncodingWindowsBalticRim;
2279 break ;
2280 // This only really encodes to UTF7 (if that) evidently
2281 // case wxFONTENCODING_UTF7 :
2282 // enc = kCFStringEncodingNonLossyASCII ;
2283 // break ;
2284 case wxFONTENCODING_UTF8 :
2285 enc = kCFStringEncodingUTF8 ;
2286 break ;
2287 case wxFONTENCODING_EUC_JP :
2288 enc = kCFStringEncodingEUC_JP;
2289 break ;
2290 case wxFONTENCODING_UTF16 :
2291 enc = kCFStringEncodingUnicode ;
2292 break ;
2293 case wxFONTENCODING_MACROMAN :
2294 enc = kCFStringEncodingMacRoman ;
2295 break ;
2296 case wxFONTENCODING_MACJAPANESE :
2297 enc = kCFStringEncodingMacJapanese ;
2298 break ;
2299 case wxFONTENCODING_MACCHINESETRAD :
2300 enc = kCFStringEncodingMacChineseTrad ;
2301 break ;
2302 case wxFONTENCODING_MACKOREAN :
2303 enc = kCFStringEncodingMacKorean ;
2304 break ;
2305 case wxFONTENCODING_MACARABIC :
2306 enc = kCFStringEncodingMacArabic ;
2307 break ;
2308 case wxFONTENCODING_MACHEBREW :
2309 enc = kCFStringEncodingMacHebrew ;
2310 break ;
2311 case wxFONTENCODING_MACGREEK :
2312 enc = kCFStringEncodingMacGreek ;
2313 break ;
2314 case wxFONTENCODING_MACCYRILLIC :
2315 enc = kCFStringEncodingMacCyrillic ;
2316 break ;
2317 case wxFONTENCODING_MACDEVANAGARI :
2318 enc = kCFStringEncodingMacDevanagari ;
2319 break ;
2320 case wxFONTENCODING_MACGURMUKHI :
2321 enc = kCFStringEncodingMacGurmukhi ;
2322 break ;
2323 case wxFONTENCODING_MACGUJARATI :
2324 enc = kCFStringEncodingMacGujarati ;
2325 break ;
2326 case wxFONTENCODING_MACORIYA :
2327 enc = kCFStringEncodingMacOriya ;
2328 break ;
2329 case wxFONTENCODING_MACBENGALI :
2330 enc = kCFStringEncodingMacBengali ;
2331 break ;
2332 case wxFONTENCODING_MACTAMIL :
2333 enc = kCFStringEncodingMacTamil ;
2334 break ;
2335 case wxFONTENCODING_MACTELUGU :
2336 enc = kCFStringEncodingMacTelugu ;
2337 break ;
2338 case wxFONTENCODING_MACKANNADA :
2339 enc = kCFStringEncodingMacKannada ;
2340 break ;
2341 case wxFONTENCODING_MACMALAJALAM :
2342 enc = kCFStringEncodingMacMalayalam ;
2343 break ;
2344 case wxFONTENCODING_MACSINHALESE :
2345 enc = kCFStringEncodingMacSinhalese ;
2346 break ;
2347 case wxFONTENCODING_MACBURMESE :
2348 enc = kCFStringEncodingMacBurmese ;
2349 break ;
2350 case wxFONTENCODING_MACKHMER :
2351 enc = kCFStringEncodingMacKhmer ;
2352 break ;
2353 case wxFONTENCODING_MACTHAI :
2354 enc = kCFStringEncodingMacThai ;
2355 break ;
2356 case wxFONTENCODING_MACLAOTIAN :
2357 enc = kCFStringEncodingMacLaotian ;
2358 break ;
2359 case wxFONTENCODING_MACGEORGIAN :
2360 enc = kCFStringEncodingMacGeorgian ;
2361 break ;
2362 case wxFONTENCODING_MACARMENIAN :
2363 enc = kCFStringEncodingMacArmenian ;
2364 break ;
2365 case wxFONTENCODING_MACCHINESESIMP :
2366 enc = kCFStringEncodingMacChineseSimp ;
2367 break ;
2368 case wxFONTENCODING_MACTIBETAN :
2369 enc = kCFStringEncodingMacTibetan ;
2370 break ;
2371 case wxFONTENCODING_MACMONGOLIAN :
2372 enc = kCFStringEncodingMacMongolian ;
2373 break ;
2374 case wxFONTENCODING_MACETHIOPIC :
2375 enc = kCFStringEncodingMacEthiopic ;
2376 break ;
2377 case wxFONTENCODING_MACCENTRALEUR :
2378 enc = kCFStringEncodingMacCentralEurRoman ;
2379 break ;
2380 case wxFONTENCODING_MACVIATNAMESE :
2381 enc = kCFStringEncodingMacVietnamese ;
2382 break ;
2383 case wxFONTENCODING_MACARABICEXT :
2384 enc = kCFStringEncodingMacExtArabic ;
2385 break ;
2386 case wxFONTENCODING_MACSYMBOL :
2387 enc = kCFStringEncodingMacSymbol ;
2388 break ;
2389 case wxFONTENCODING_MACDINGBATS :
2390 enc = kCFStringEncodingMacDingbats ;
2391 break ;
2392 case wxFONTENCODING_MACTURKISH :
2393 enc = kCFStringEncodingMacTurkish ;
2394 break ;
2395 case wxFONTENCODING_MACCROATIAN :
2396 enc = kCFStringEncodingMacCroatian ;
2397 break ;
2398 case wxFONTENCODING_MACICELANDIC :
2399 enc = kCFStringEncodingMacIcelandic ;
2400 break ;
2401 case wxFONTENCODING_MACROMANIAN :
2402 enc = kCFStringEncodingMacRomanian ;
2403 break ;
2404 case wxFONTENCODING_MACCELTIC :
2405 enc = kCFStringEncodingMacCeltic ;
2406 break ;
2407 case wxFONTENCODING_MACGAELIC :
2408 enc = kCFStringEncodingMacGaelic ;
2409 break ;
2410 // case wxFONTENCODING_MACKEYBOARD :
2411 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2412 // break ;
2413
2414 default :
2415 // because gcc is picky
2416 break ;
2417 }
2418
2419 return enc ;
2420 }
2421
2422 class wxMBConv_cocoa : public wxMBConv
2423 {
2424 public:
2425 wxMBConv_cocoa()
2426 {
2427 Init(CFStringGetSystemEncoding()) ;
2428 }
2429
2430 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2431 {
2432 m_encoding = conv.m_encoding;
2433 }
2434
2435 #if wxUSE_FONTMAP
2436 wxMBConv_cocoa(const wxChar* name)
2437 {
2438 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2439 }
2440 #endif
2441
2442 wxMBConv_cocoa(wxFontEncoding encoding)
2443 {
2444 Init( wxCFStringEncFromFontEnc(encoding) );
2445 }
2446
2447 ~wxMBConv_cocoa()
2448 {
2449 }
2450
2451 void Init( CFStringEncoding encoding)
2452 {
2453 m_encoding = encoding ;
2454 }
2455
2456 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2457 {
2458 wxASSERT(szUnConv);
2459
2460 CFStringRef theString = CFStringCreateWithBytes (
2461 NULL, //the allocator
2462 (const UInt8*)szUnConv,
2463 strlen(szUnConv),
2464 m_encoding,
2465 false //no BOM/external representation
2466 );
2467
2468 wxASSERT(theString);
2469
2470 size_t nOutLength = CFStringGetLength(theString);
2471
2472 if (szOut == NULL)
2473 {
2474 CFRelease(theString);
2475 return nOutLength;
2476 }
2477
2478 CFRange theRange = { 0, nOutSize };
2479
2480 #if SIZEOF_WCHAR_T == 4
2481 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2482 #endif
2483
2484 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2485
2486 CFRelease(theString);
2487
2488 szUniCharBuffer[nOutLength] = '\0' ;
2489
2490 #if SIZEOF_WCHAR_T == 4
2491 wxMBConvUTF16 converter ;
2492 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2493 delete[] szUniCharBuffer;
2494 #endif
2495
2496 return nOutLength;
2497 }
2498
2499 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2500 {
2501 wxASSERT(szUnConv);
2502
2503 size_t nRealOutSize;
2504 size_t nBufSize = wxWcslen(szUnConv);
2505 UniChar* szUniBuffer = (UniChar*) szUnConv;
2506
2507 #if SIZEOF_WCHAR_T == 4
2508 wxMBConvUTF16 converter ;
2509 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2510 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2511 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2512 nBufSize /= sizeof(UniChar);
2513 #endif
2514
2515 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2516 NULL, //allocator
2517 szUniBuffer,
2518 nBufSize,
2519 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2520 );
2521
2522 wxASSERT(theString);
2523
2524 //Note that CER puts a BOM when converting to unicode
2525 //so we check and use getchars instead in that case
2526 if (m_encoding == kCFStringEncodingUnicode)
2527 {
2528 if (szOut != NULL)
2529 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2530
2531 nRealOutSize = CFStringGetLength(theString) + 1;
2532 }
2533 else
2534 {
2535 CFStringGetBytes(
2536 theString,
2537 CFRangeMake(0, CFStringGetLength(theString)),
2538 m_encoding,
2539 0, //what to put in characters that can't be converted -
2540 //0 tells CFString to return NULL if it meets such a character
2541 false, //not an external representation
2542 (UInt8*) szOut,
2543 nOutSize,
2544 (CFIndex*) &nRealOutSize
2545 );
2546 }
2547
2548 CFRelease(theString);
2549
2550 #if SIZEOF_WCHAR_T == 4
2551 delete[] szUniBuffer;
2552 #endif
2553
2554 return nRealOutSize - 1;
2555 }
2556
2557 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2558
2559 bool IsOk() const
2560 {
2561 return m_encoding != kCFStringEncodingInvalidId &&
2562 CFStringIsEncodingAvailable(m_encoding);
2563 }
2564
2565 private:
2566 CFStringEncoding m_encoding ;
2567 };
2568
2569 #endif // defined(__WXCOCOA__)
2570
2571 // ============================================================================
2572 // Mac conversion classes
2573 // ============================================================================
2574
2575 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2576
2577 class wxMBConv_mac : public wxMBConv
2578 {
2579 public:
2580 wxMBConv_mac()
2581 {
2582 Init(CFStringGetSystemEncoding()) ;
2583 }
2584
2585 wxMBConv_mac(const wxMBConv_mac& conv)
2586 {
2587 Init(conv.m_char_encoding);
2588 }
2589
2590 #if wxUSE_FONTMAP
2591 wxMBConv_mac(const wxChar* name)
2592 {
2593 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2594 }
2595 #endif
2596
2597 wxMBConv_mac(wxFontEncoding encoding)
2598 {
2599 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2600 }
2601
2602 ~wxMBConv_mac()
2603 {
2604 OSStatus status = noErr ;
2605 status = TECDisposeConverter(m_MB2WC_converter);
2606 status = TECDisposeConverter(m_WC2MB_converter);
2607 }
2608
2609
2610 void Init( TextEncodingBase encoding)
2611 {
2612 OSStatus status = noErr ;
2613 m_char_encoding = encoding ;
2614 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2615
2616 status = TECCreateConverter(&m_MB2WC_converter,
2617 m_char_encoding,
2618 m_unicode_encoding);
2619 status = TECCreateConverter(&m_WC2MB_converter,
2620 m_unicode_encoding,
2621 m_char_encoding);
2622 }
2623
2624 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2625 {
2626 OSStatus status = noErr ;
2627 ByteCount byteOutLen ;
2628 ByteCount byteInLen = strlen(psz) ;
2629 wchar_t *tbuf = NULL ;
2630 UniChar* ubuf = NULL ;
2631 size_t res = 0 ;
2632
2633 if (buf == NULL)
2634 {
2635 //apple specs say at least 32
2636 n = wxMax( 32 , byteInLen ) ;
2637 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2638 }
2639
2640 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2641
2642 #if SIZEOF_WCHAR_T == 4
2643 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2644 #else
2645 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2646 #endif
2647 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2648 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2649 #if SIZEOF_WCHAR_T == 4
2650 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2651 // is not properly terminated we get random characters at the end
2652 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2653 wxMBConvUTF16 converter ;
2654 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2655 free( ubuf ) ;
2656 #else
2657 res = byteOutLen / sizeof( UniChar ) ;
2658 #endif
2659
2660 if ( buf == NULL )
2661 free(tbuf) ;
2662
2663 if ( buf && res < n)
2664 buf[res] = 0;
2665
2666 return res ;
2667 }
2668
2669 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2670 {
2671 OSStatus status = noErr ;
2672 ByteCount byteOutLen ;
2673 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2674
2675 char *tbuf = NULL ;
2676
2677 if (buf == NULL)
2678 {
2679 //apple specs say at least 32
2680 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2681 tbuf = (char*) malloc( n ) ;
2682 }
2683
2684 ByteCount byteBufferLen = n ;
2685 UniChar* ubuf = NULL ;
2686
2687 #if SIZEOF_WCHAR_T == 4
2688 wxMBConvUTF16 converter ;
2689 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2690 byteInLen = unicharlen ;
2691 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2692 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2693 #else
2694 ubuf = (UniChar*) psz ;
2695 #endif
2696
2697 status = TECConvertText(
2698 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2699 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2700
2701 #if SIZEOF_WCHAR_T == 4
2702 free( ubuf ) ;
2703 #endif
2704
2705 if ( buf == NULL )
2706 free(tbuf) ;
2707
2708 size_t res = byteOutLen ;
2709 if ( buf && res < n)
2710 {
2711 buf[res] = 0;
2712
2713 //we need to double-trip to verify it didn't insert any ? in place
2714 //of bogus characters
2715 wxWCharBuffer wcBuf(n);
2716 size_t pszlen = wxWcslen(psz);
2717 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2718 wxWcslen(wcBuf) != pszlen ||
2719 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2720 {
2721 // we didn't obtain the same thing we started from, hence
2722 // the conversion was lossy and we consider that it failed
2723 return (size_t)-1;
2724 }
2725 }
2726
2727 return res ;
2728 }
2729
2730 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2731
2732 bool IsOk() const
2733 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2734
2735 private:
2736 TECObjectRef m_MB2WC_converter;
2737 TECObjectRef m_WC2MB_converter;
2738
2739 TextEncodingBase m_char_encoding;
2740 TextEncodingBase m_unicode_encoding;
2741 };
2742
2743 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2744
2745 // ============================================================================
2746 // wxEncodingConverter based conversion classes
2747 // ============================================================================
2748
2749 #if wxUSE_FONTMAP
2750
2751 class wxMBConv_wxwin : public wxMBConv
2752 {
2753 private:
2754 void Init()
2755 {
2756 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2757 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2758 }
2759
2760 public:
2761 // temporarily just use wxEncodingConverter stuff,
2762 // so that it works while a better implementation is built
2763 wxMBConv_wxwin(const wxChar* name)
2764 {
2765 if (name)
2766 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2767 else
2768 m_enc = wxFONTENCODING_SYSTEM;
2769
2770 Init();
2771 }
2772
2773 wxMBConv_wxwin(wxFontEncoding enc)
2774 {
2775 m_enc = enc;
2776
2777 Init();
2778 }
2779
2780 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2781 {
2782 size_t inbuf = strlen(psz);
2783 if (buf)
2784 {
2785 if (!m2w.Convert(psz, buf))
2786 return (size_t)-1;
2787 }
2788 return inbuf;
2789 }
2790
2791 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2792 {
2793 const size_t inbuf = wxWcslen(psz);
2794 if (buf)
2795 {
2796 if (!w2m.Convert(psz, buf))
2797 return (size_t)-1;
2798 }
2799
2800 return inbuf;
2801 }
2802
2803 virtual size_t GetMBNulLen() const
2804 {
2805 switch ( m_enc )
2806 {
2807 case wxFONTENCODING_UTF16BE:
2808 case wxFONTENCODING_UTF16LE:
2809 return 2;
2810
2811 case wxFONTENCODING_UTF32BE:
2812 case wxFONTENCODING_UTF32LE:
2813 return 4;
2814
2815 default:
2816 return 1;
2817 }
2818 }
2819
2820 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2821
2822 bool IsOk() const { return m_ok; }
2823
2824 public:
2825 wxFontEncoding m_enc;
2826 wxEncodingConverter m2w, w2m;
2827
2828 private:
2829 // were we initialized successfully?
2830 bool m_ok;
2831
2832 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2833 };
2834
2835 // make the constructors available for unit testing
2836 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2837 {
2838 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2839 if ( !result->IsOk() )
2840 {
2841 delete result;
2842 return 0;
2843 }
2844
2845 return result;
2846 }
2847
2848 #endif // wxUSE_FONTMAP
2849
2850 // ============================================================================
2851 // wxCSConv implementation
2852 // ============================================================================
2853
2854 void wxCSConv::Init()
2855 {
2856 m_name = NULL;
2857 m_convReal = NULL;
2858 m_deferred = true;
2859 }
2860
2861 wxCSConv::wxCSConv(const wxChar *charset)
2862 {
2863 Init();
2864
2865 if ( charset )
2866 {
2867 SetName(charset);
2868 }
2869
2870 #if wxUSE_FONTMAP
2871 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2872 #else
2873 m_encoding = wxFONTENCODING_SYSTEM;
2874 #endif
2875 }
2876
2877 wxCSConv::wxCSConv(wxFontEncoding encoding)
2878 {
2879 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2880 {
2881 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2882
2883 encoding = wxFONTENCODING_SYSTEM;
2884 }
2885
2886 Init();
2887
2888 m_encoding = encoding;
2889 }
2890
2891 wxCSConv::~wxCSConv()
2892 {
2893 Clear();
2894 }
2895
2896 wxCSConv::wxCSConv(const wxCSConv& conv)
2897 : wxMBConv()
2898 {
2899 Init();
2900
2901 SetName(conv.m_name);
2902 m_encoding = conv.m_encoding;
2903 }
2904
2905 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2906 {
2907 Clear();
2908
2909 SetName(conv.m_name);
2910 m_encoding = conv.m_encoding;
2911
2912 return *this;
2913 }
2914
2915 void wxCSConv::Clear()
2916 {
2917 free(m_name);
2918 delete m_convReal;
2919
2920 m_name = NULL;
2921 m_convReal = NULL;
2922 }
2923
2924 void wxCSConv::SetName(const wxChar *charset)
2925 {
2926 if (charset)
2927 {
2928 m_name = wxStrdup(charset);
2929 m_deferred = true;
2930 }
2931 }
2932
#if wxUSE_FONTMAP

#include "wx/hashmap.h"

// Cache used by wxCSConv::DoCreate(): maps an encoding to the charset name
// which worked with iconv for it; an empty name records that none did.
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
2941
2942 wxMBConv *wxCSConv::DoCreate() const
2943 {
2944 #if wxUSE_FONTMAP
2945 wxLogTrace(TRACE_STRCONV,
2946 wxT("creating conversion for %s"),
2947 (m_name ? m_name
2948 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2949 #endif // wxUSE_FONTMAP
2950
2951 // check for the special case of ASCII or ISO8859-1 charset: as we have
2952 // special knowledge of it anyhow, we don't need to create a special
2953 // conversion object
2954 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2955 m_encoding == wxFONTENCODING_DEFAULT )
2956 {
2957 // don't convert at all
2958 return NULL;
2959 }
2960
2961 // we trust OS to do conversion better than we can so try external
2962 // conversion methods first
2963 //
2964 // the full order is:
2965 // 1. OS conversion (iconv() under Unix or Win32 API)
2966 // 2. hard coded conversions for UTF
2967 // 3. wxEncodingConverter as fall back
2968
2969 // step (1)
2970 #ifdef HAVE_ICONV
2971 #if !wxUSE_FONTMAP
2972 if ( m_name )
2973 #endif // !wxUSE_FONTMAP
2974 {
2975 wxString name(m_name);
2976 wxFontEncoding encoding(m_encoding);
2977
2978 if ( !name.empty() )
2979 {
2980 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2981 if ( conv->IsOk() )
2982 return conv;
2983
2984 delete conv;
2985
2986 #if wxUSE_FONTMAP
2987 encoding =
2988 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2989 #endif // wxUSE_FONTMAP
2990 }
2991 #if wxUSE_FONTMAP
2992 {
2993 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2994 if ( it != gs_nameCache.end() )
2995 {
2996 if ( it->second.empty() )
2997 return NULL;
2998
2999 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3000 if ( conv->IsOk() )
3001 return conv;
3002
3003 delete conv;
3004 }
3005
3006 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3007
3008 for ( ; *names; ++names )
3009 {
3010 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3011 if ( conv->IsOk() )
3012 {
3013 gs_nameCache[encoding] = *names;
3014 return conv;
3015 }
3016
3017 delete conv;
3018 }
3019
3020 gs_nameCache[encoding] = _T(""); // cache the failure
3021 }
3022 #endif // wxUSE_FONTMAP
3023 }
3024 #endif // HAVE_ICONV
3025
3026 #ifdef wxHAVE_WIN32_MB2WC
3027 {
3028 #if wxUSE_FONTMAP
3029 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3030 : new wxMBConv_win32(m_encoding);
3031 if ( conv->IsOk() )
3032 return conv;
3033
3034 delete conv;
3035 #else
3036 return NULL;
3037 #endif
3038 }
3039 #endif // wxHAVE_WIN32_MB2WC
3040
3041 #if defined(__WXMAC__)
3042 {
3043 // leave UTF16 and UTF32 to the built-ins of wx
3044 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3045 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3046 {
3047 #if wxUSE_FONTMAP
3048 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3049 : new wxMBConv_mac(m_encoding);
3050 #else
3051 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3052 #endif
3053 if ( conv->IsOk() )
3054 return conv;
3055
3056 delete conv;
3057 }
3058 }
3059 #endif
3060
3061 #if defined(__WXCOCOA__)
3062 {
3063 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3064 {
3065 #if wxUSE_FONTMAP
3066 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3067 : new wxMBConv_cocoa(m_encoding);
3068 #else
3069 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3070 #endif
3071
3072 if ( conv->IsOk() )
3073 return conv;
3074
3075 delete conv;
3076 }
3077 }
3078 #endif
3079 // step (2)
3080 wxFontEncoding enc = m_encoding;
3081 #if wxUSE_FONTMAP
3082 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3083 {
3084 // use "false" to suppress interactive dialogs -- we can be called from
3085 // anywhere and popping up a dialog from here is the last thing we want to
3086 // do
3087 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3088 }
3089 #endif // wxUSE_FONTMAP
3090
3091 switch ( enc )
3092 {
3093 case wxFONTENCODING_UTF7:
3094 return new wxMBConvUTF7;
3095
3096 case wxFONTENCODING_UTF8:
3097 return new wxMBConvUTF8;
3098
3099 case wxFONTENCODING_UTF16BE:
3100 return new wxMBConvUTF16BE;
3101
3102 case wxFONTENCODING_UTF16LE:
3103 return new wxMBConvUTF16LE;
3104
3105 case wxFONTENCODING_UTF32BE:
3106 return new wxMBConvUTF32BE;
3107
3108 case wxFONTENCODING_UTF32LE:
3109 return new wxMBConvUTF32LE;
3110
3111 default:
3112 // nothing to do but put here to suppress gcc warnings
3113 break;
3114 }
3115
3116 // step (3)
3117 #if wxUSE_FONTMAP
3118 {
3119 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3120 : new wxMBConv_wxwin(m_encoding);
3121 if ( conv->IsOk() )
3122 return conv;
3123
3124 delete conv;
3125 }
3126 #endif // wxUSE_FONTMAP
3127
3128 // NB: This is a hack to prevent deadlock. What could otherwise happen
3129 // in Unicode build: wxConvLocal creation ends up being here
3130 // because of some failure and logs the error. But wxLog will try to
3131 // attach timestamp, for which it will need wxConvLocal (to convert
3132 // time to char* and then wchar_t*), but that fails, tries to log
3133 // error, but wxLog has a (already locked) critical section that
3134 // guards static buffer.
3135 static bool alreadyLoggingError = false;
3136 if (!alreadyLoggingError)
3137 {
3138 alreadyLoggingError = true;
3139 wxLogError(_("Cannot convert from the charset '%s'!"),
3140 m_name ? m_name
3141 :
3142 #if wxUSE_FONTMAP
3143 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3144 #else // !wxUSE_FONTMAP
3145 wxString::Format(_("encoding %s"), m_encoding).c_str()
3146 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3147 );
3148
3149 alreadyLoggingError = false;
3150 }
3151
3152 return NULL;
3153 }
3154
3155 void wxCSConv::CreateConvIfNeeded() const
3156 {
3157 if ( m_deferred )
3158 {
3159 wxCSConv *self = (wxCSConv *)this; // const_cast
3160
3161 #if wxUSE_INTL
3162 // if we don't have neither the name nor the encoding, use the default
3163 // encoding for this system
3164 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3165 {
3166 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3167 }
3168 #endif // wxUSE_INTL
3169
3170 self->m_convReal = DoCreate();
3171 self->m_deferred = false;
3172 }
3173 }
3174
3175 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3176 {
3177 CreateConvIfNeeded();
3178
3179 if (m_convReal)
3180 return m_convReal->MB2WC(buf, psz, n);
3181
3182 // latin-1 (direct)
3183 size_t len = strlen(psz);
3184
3185 if (buf)
3186 {
3187 for (size_t c = 0; c <= len; c++)
3188 buf[c] = (unsigned char)(psz[c]);
3189 }
3190
3191 return len;
3192 }
3193
3194 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3195 {
3196 CreateConvIfNeeded();
3197
3198 if (m_convReal)
3199 return m_convReal->WC2MB(buf, psz, n);
3200
3201 // latin-1 (direct)
3202 const size_t len = wxWcslen(psz);
3203 if (buf)
3204 {
3205 for (size_t c = 0; c <= len; c++)
3206 {
3207 if (psz[c] > 0xFF)
3208 return (size_t)-1;
3209
3210 buf[c] = (char)psz[c];
3211 }
3212 }
3213 else
3214 {
3215 for (size_t c = 0; c <= len; c++)
3216 {
3217 if (psz[c] > 0xFF)
3218 return (size_t)-1;
3219 }
3220 }
3221
3222 return len;
3223 }
3224
3225 size_t wxCSConv::GetMBNulLen() const
3226 {
3227 CreateConvIfNeeded();
3228
3229 if ( m_convReal )
3230 {
3231 return m_convReal->GetMBNulLen();
3232 }
3233
3234 return 1;
3235 }
3236
3237 // ----------------------------------------------------------------------------
3238 // globals
3239 // ----------------------------------------------------------------------------
3240
3241 #ifdef __WINDOWS__
3242 static wxMBConv_win32 wxConvLibcObj;
3243 #elif defined(__WXMAC__) && !defined(__MACH__)
3244 static wxMBConv_mac wxConvLibcObj ;
3245 #else
3246 static wxMBConvLibc wxConvLibcObj;
3247 #endif
3248
3249 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3250 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3251 static wxMBConvUTF7 wxConvUTF7Obj;
3252 static wxMBConvUTF8 wxConvUTF8Obj;
3253
3254 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3255 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3256 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3257 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3258 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3259 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3260 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3261 #ifdef __WXOSX__
3262 wxConvUTF8Obj;
3263 #else
3264 wxConvLibcObj;
3265 #endif
3266
3267
3268 #else // !wxUSE_WCHAR_T
3269
3270 // stand-ins in absence of wchar_t
3271 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3272 wxConvISO8859_1,
3273 wxConvLocal,
3274 wxConvUTF8;
3275
3276 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T