]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
MB2WC/WC2MB are not pure virtual any longer, implement them in terms of To/FromWChar()
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
25
26 #ifdef __BORLANDC__
27 #pragma hdrstop
28 #endif
29
30 #ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33 #endif // WX_PRECOMP
34
35 #include "wx/strconv.h"
36
37 #if wxUSE_WCHAR_T
38
39 #ifdef __WINDOWS__
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #endif
43
44 #ifndef __WXWINCE__
45 #include <errno.h>
46 #endif
47
48 #include <ctype.h>
49 #include <string.h>
50 #include <stdlib.h>
51
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
55
56 #ifdef __SALFORDC__
57 #include <clib.h>
58 #endif
59
60 #ifdef HAVE_ICONV
61 #include <iconv.h>
62 #include "wx/thread.h"
63 #endif
64
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
67 #include "wx/utils.h"
68
69 #ifdef __WXMAC__
70 #ifndef __DARWIN__
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
74 #endif
75
76 #include "wx/mac/private.h" // includes mac headers
77 #endif
78
// name of the trace mask for the messages of this module
// NOTE(review): no use of it is visible in this part of the file
#define TRACE_STRCONV _T("strconv")

// WC_UTF16 is defined when wchar_t is 2 bytes wide: the conversions below
// then go through encode_utf16()/decode_utf16() to produce/consume surrogate
// pairs instead of storing the code points in wchar_t directly
#if SIZEOF_WCHAR_T == 2
#define WC_UTF16
#endif
84
85 // ============================================================================
86 // implementation
87 // ============================================================================
88
// helper of wxMBConv::ToWChar(): returns true if at least one of the n bytes
// starting at p is not NUL and false if all of them are NUL (or if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; i++ )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
97
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4
100 // ----------------------------------------------------------------------------
101
102
103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
104 {
105 if (input<=0xffff)
106 {
107 if (output)
108 *output = (wxUint16) input;
109 return 1;
110 }
111 else if (input>=0x110000)
112 {
113 return (size_t)-1;
114 }
115 else
116 {
117 if (output)
118 {
119 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
120 *output = (wxUint16) ((input&0x3ff)+0xdc00);
121 }
122 return 2;
123 }
124 }
125
126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
127 {
128 if ((*input<0xd800) || (*input>0xdfff))
129 {
130 output = *input;
131 return 1;
132 }
133 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
134 {
135 output = *input;
136 return (size_t)-1;
137 }
138 else
139 {
140 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
141 return 2;
142 }
143 }
144
145
146 // ----------------------------------------------------------------------------
147 // wxMBConv
148 // ----------------------------------------------------------------------------
149
150 size_t
151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
152 const char *src, size_t srcLen) const
153 {
154 // although new conversion classes are supposed to implement this function
155 // directly, the existins ones only implement the old MB2WC() and so, to
156 // avoid to have to rewrite all conversion classes at once, we provide a
157 // default (but not efficient) implementation of this one in terms of the
158 // old function by copying the input to ensure that it's NUL-terminated and
159 // then using MB2WC() to convert it
160
161 // the number of chars [which would be] written to dst [if it were not NULL]
162 size_t dstWritten = 0;
163
164 // the number of NULs terminating this string
165 size_t nulLen wxDUMMY_INITIALIZE(0);
166
167 // if we were not given the input size we just have to assume that the
168 // string is properly terminated as we have no way of knowing how long it
169 // is anyhow, but if we do have the size check whether there are enough
170 // NULs at the end
171 wxCharBuffer bufTmp;
172 const char *srcEnd;
173 if ( srcLen != (size_t)-1 )
174 {
175 // we need to know how to find the end of this string
176 nulLen = GetMBNulLen();
177 if ( nulLen == wxCONV_FAILED )
178 return wxCONV_FAILED;
179
180 // if there are enough NULs we can avoid the copy
181 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
182 {
183 // make a copy in order to properly NUL-terminate the string
184 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
185 char * const p = bufTmp.data();
186 memcpy(p, src, srcLen);
187 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
188 *s = '\0';
189
190 src = bufTmp;
191 }
192
193 srcEnd = src + srcLen;
194 }
195 else // quit after the first loop iteration
196 {
197 srcEnd = NULL;
198 }
199
200 for ( ;; )
201 {
202 // try to convert the current chunk
203 size_t lenChunk = MB2WC(NULL, src, 0);
204 if ( lenChunk == 0 )
205 {
206 // nothing left in the input string, conversion succeeded
207 break;
208 }
209
210 if ( lenChunk == wxCONV_FAILED )
211 return wxCONV_FAILED;
212
213 // if we already have a previous chunk, leave the NUL separating it
214 // from this one
215 if ( dstWritten )
216 {
217 dstWritten++;
218 if ( dst )
219 dst++;
220 }
221
222 dstWritten += lenChunk;
223
224 if ( dst )
225 {
226 if ( dstWritten > dstLen )
227 return wxCONV_FAILED;
228
229 lenChunk = MB2WC(dst, src, lenChunk + 1 /* for NUL */);
230 if ( lenChunk == wxCONV_FAILED )
231 return wxCONV_FAILED;
232
233 dst += lenChunk;
234 }
235
236 if ( !srcEnd )
237 {
238 // we convert the entire string in this cas, as we suppose that the
239 // string is NUL-terminated and so srcEnd is not used at all
240 break;
241 }
242
243 // advance the input pointer past the end of this chunk
244 while ( NotAllNULs(src, nulLen) )
245 {
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
250 src += nulLen;
251 }
252
253 src += nulLen; // skipping over its terminator as well
254
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by inEnd
258 if ( src >= srcEnd )
259 break;
260 }
261
262 return dstWritten;
263 }
264
265 size_t
266 wxMBConv::FromWChar(char *dst, size_t dstLen,
267 const wchar_t *src, size_t srcLen) const
268 {
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten = 0;
271
272 // make a copy of the input string unless it is already properly
273 // NUL-terminated
274 //
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp;
278 if ( srcLen == (size_t)-1 )
279 {
280 srcLen = wxWcslen(src) + 1;
281 }
282 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
283 {
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp = wxWCharBuffer(srcLen);
286 memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
287 src = bufTmp;
288 }
289
290 const size_t lenNul = GetMBNulLen();
291 for ( const wchar_t * const srcEnd = src + srcLen;
292 src < srcEnd;
293 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
294 {
295 // try to convert the current chunk
296 size_t lenChunk = WC2MB(NULL, src, 0);
297
298 if ( lenChunk == wxCONV_FAILED )
299 return wxCONV_FAILED;
300
301 lenChunk += lenNul;
302 dstWritten += lenChunk;
303
304 if ( dst )
305 {
306 if ( dstWritten > dstLen )
307 return wxCONV_FAILED;
308
309 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
310 return wxCONV_FAILED;
311
312 dst += lenChunk;
313 }
314 }
315
316 return dstWritten;
317 }
318
319 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
320 {
321 size_t rc = ToWChar(out, outLen, in);
322 if ( rc != wxCONV_FAILED )
323 {
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
326 rc--;
327 }
328
329 return rc;
330 }
331
332 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
333 {
334 size_t rc = FromWChar(out, outLen, in);
335 if ( rc != wxCONV_FAILED )
336 {
337 rc -= GetMBNulLen();
338 }
339
340 return rc;
341 }
342
// The destructor has nothing to release; according to the original note it
// is defined out of line here because this is probably needed for linking
// under Darwin.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
347
348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
349 {
350 if ( psz )
351 {
352 // calculate the length of the buffer needed first
353 const size_t nLen = MB2WC(NULL, psz, 0);
354 if ( nLen != wxCONV_FAILED )
355 {
356 // now do the actual conversion
357 wxWCharBuffer buf(nLen /* +1 added implicitly */);
358
359 // +1 for the trailing NULL
360 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
361 return buf;
362 }
363 }
364
365 return wxWCharBuffer();
366 }
367
368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
369 {
370 if ( pwz )
371 {
372 const size_t nLen = WC2MB(NULL, pwz, 0);
373 if ( nLen != wxCONV_FAILED )
374 {
375 // extra space for trailing NUL(s)
376 static const size_t extraLen = GetMaxMBNulLen();
377
378 wxCharBuffer buf(nLen + extraLen - 1);
379 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
380 return buf;
381 }
382 }
383
384 return wxCharBuffer();
385 }
386
387 const wxWCharBuffer
388 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
389 {
390 const size_t dstLen = ToWChar(NULL, 0, in, inLen);
391 if ( dstLen != wxCONV_FAILED )
392 {
393 wxWCharBuffer wbuf(dstLen);
394 if ( ToWChar(wbuf.data(), dstLen, in, inLen) )
395 {
396 if ( outLen )
397 *outLen = dstLen;
398 return wbuf;
399 }
400 }
401
402 if ( outLen )
403 *outLen = 0;
404
405 return wxWCharBuffer();
406 }
407
408 const wxCharBuffer
409 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
410 {
411 const size_t dstLen = FromWChar(NULL, 0, in, inLen);
412 if ( dstLen != wxCONV_FAILED )
413 {
414 wxCharBuffer buf(dstLen);
415 if ( FromWChar(buf.data(), dstLen, in, inLen) )
416 {
417 if ( outLen )
418 *outLen = dstLen;
419 return buf;
420 }
421 }
422
423 if ( outLen )
424 *outLen = 0;
425
426 return wxCharBuffer();
427 }
428
429 // ----------------------------------------------------------------------------
430 // wxMBConvLibc
431 // ----------------------------------------------------------------------------
432
// Convert multibyte string to wide characters by simply forwarding all the
// arguments to wxMB2WC() and returning its result.
// NOTE(review): wxMB2WC() is not defined in this part of the file, going by
// the class name it presumably wraps the C library conversion -- confirm.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
437
// Convert wide characters to multibyte string by simply forwarding all the
// arguments to wxWC2MB() and returning its result.
// NOTE(review): wxWC2MB() is not defined in this part of the file, going by
// the class name it presumably wraps the C library conversion -- confirm.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
442
443 // ----------------------------------------------------------------------------
444 // wxConvBrokenFileNames
445 // ----------------------------------------------------------------------------
446
447 #ifdef __UNIX__
448
449 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
450 {
451 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
452 || wxStricmp(charset, _T("UTF8")) == 0 )
453 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
454 else
455 m_conv = new wxCSConv(charset);
456 }
457
458 #endif // __UNIX__
459
460 // ----------------------------------------------------------------------------
461 // UTF-7
462 // ----------------------------------------------------------------------------
463
464 // Implementation (C) 2004 Fredrik Roubert
465
//
// BASE64 decoding table
//
// It is indexed by the byte value and contains the 6 bit value of the
// corresponding BASE64 digit or 0xff if this byte is not a BASE64 digit at
// all (wxMBConvUTF7::MB2WC() stops decoding at the first such byte).
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x00
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x10
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x20
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // '+' and '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // 0x30: '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 0x40: 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 0x50
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // ...'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 0x60: 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 0x70
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // ...'z'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x80 and up: none
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
504
// Decode the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL, the decoded wide characters are stored in it and n is
// its size in wide characters; the result is NUL-terminated only if there is
// space left for the NUL. If buf is NULL, only the length is computed.
//
// Returns the number of wide characters decoded, not counting the NUL, or
// (size_t)-1 if a '+' is followed neither by '-' nor by enough BASE64 digits
// to decode at least one byte.
//
// NOTE(review): n is only checked once per iteration of the outer loop, the
// characters of a BASE64 run are all stored without comparing len with n.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits and l is the number of bits in
            // it not yet extracted; lsb tells if the next byte extracted is
            // the low (true) or the high (false) byte of a 16 bit unit and ok
            // becomes true as soon as at least one byte was extracted
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // each BASE64 digit contributes 6 more bits
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // extract the topmost 8 bits not yet used
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte completes the unit started below
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // high byte starts a new unit, buf is not advanced yet
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return (size_t)-1;
            }

            // the '-' ending the BASE64 run is not part of the text
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
572
//
// BASE64 encoding table
//
// Maps the 6 bit values (0..63) to the corresponding BASE64 digits.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
587
//
// UTF-7 encoding table
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The table is indexed by the (7 bit) character code, one row per 16
// characters. wxMBConvUTF7::WC2MB() only writes the characters of class 0
// directly, all the others are BASE64 encoded.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 0x00
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0x10
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 0x20: ' ' to '/'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 0x30: '0' to '?'
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40: '@' to 'O'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 0x50: 'P' to '_'
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60: '`' to 'o'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 0x70: 'p' to DEL
};
607
// Encode the NUL-terminated wide character string psz in UTF-7.
//
// If buf is not NULL, the result is stored in it and n is its size in bytes;
// the result is NUL-terminated only if there is space left for the NUL. If
// buf is NULL, only the length is computed.
//
// Returns the length of the result in bytes, not counting the NUL, or
// (size_t)-1 if wchar_t is not 16 bit and a character greater than 0xffff is
// found where a new BASE64 run would start.
//
// NOTE(review): n is only checked once per iteration of the outer loop, a
// whole BASE64 run is written without comparing len with n. Also, the
// characters inside a run are not checked for being greater than 0xffff,
// only their low 16 bits are encoded.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;
            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return (size_t)-1;
        }
#endif
        else
        {
            // anything else is written as "+<BASE64 digits>-", with '+'
            // itself being just "+-"
            if (buf)
                *buf++ = '+';
            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode and l is the number of
                // bits in it not yet written out
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // add the high (lsb == 0) and then the low byte of cc
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // and output as many full 6 bit digits as we have
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // the run continues until the end of the string or the
                    // next character which can be written directly; psz is
                    // only advanced past the character if it is encoded
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }
                if (l != 0)
                {
                    // left-over bits (fewer than 6) are padded with zeroes
                    // on the right to make the last digit
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
                    len++;
                }
            }
            if (buf)
                *buf++ = '-';
            len++;
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
673
674 // ----------------------------------------------------------------------------
675 // UTF-8
676 // ----------------------------------------------------------------------------
677
// utf8_max[i] is the greatest value which can be encoded using i + 1 bytes
// in UTF-8: it is used both to find the length of the encoding of a
// character and to detect the overlong sequences when decoding
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: the byte b is mapped to the
// character wxUnicodePUA + b
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
685
// Decode the NUL-terminated UTF-8 string psz.
//
// If buf is not NULL, the decoded wide characters are stored in it and n is
// its size in wide characters; the result is NUL-terminated only if there is
// space left for the NUL. If buf is NULL, only the length is computed.
//
// What happens with the invalid input (stray continuation bytes, truncated
// or overlong sequences, values which can't be represented) depends on
// m_options: with MAP_INVALID_UTF8_TO_PUA each offending byte is mapped to
// the character wxUnicodePUA + byte, with MAP_INVALID_UTF8_TO_OCTAL it is
// replaced with a backslash followed by 3 octal digits (and the backslashes
// in the input are doubled) and otherwise the conversion fails.
//
// Returns the number of wide characters in the result, not counting the NUL,
// or (size_t)-1 on failure.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of the sequence to be able to escape all of its
        // bytes if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;
        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // from now on cnt is the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                // (a continuation byte can't start a sequence)
                invalid = true;
            }
            else
            {
                // index into utf8_max of the greatest value which would fit
                // into a sequence one byte shorter than this one
                unsigned ocnt = cnt - 1;

                // payload bits of the first byte
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        // (the offending byte is not consumed)
                        invalid = true;
                        break;
                    }
                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    // (the value would have fit in a shorter sequence)
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == (size_t)-1)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }
            if (invalid)
            {
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    // map each byte of the sequence to its PUA character
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != (size_t)-1);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    // replace each byte of the sequence with "\ooo"; notice
                    // that len is increased even if there is not enough
                    // space left in buf to store the escape
                    while (opsz < psz && (!buf || len < n))
                    {
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }
                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return (size_t)-1;
                }
            }
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
822
// returns true if wch is one of the octal digits, i.e. in '0'..'7' range
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
827
// Encode the NUL-terminated wide character string psz in UTF-8.
//
// If buf is not NULL, the result is stored in it and n is its size in bytes;
// the result is NUL-terminated only if there is space left for the NUL. If
// buf is NULL, only the length is computed.
//
// This function also undoes the mappings of the invalid bytes done by
// MB2WC(), according to m_options: with MAP_INVALID_UTF8_TO_PUA the
// characters in the range [wxUnicodePUA, wxUnicodePUAEnd) are converted back
// to single bytes and with MAP_INVALID_UTF8_TO_OCTAL so are the "\ooo"
// escapes, while the doubled backslashes become a single one.
//
// Returns the length of the result in bytes, not counting the NUL.
//
// NOTE(review): n is only checked once per character, so a multibyte
// sequence can be written past the end of a buffer which is too small.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;
#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);

        // an invalid surrogate is not an error here: it is encoded on its own
        psz += (pa == (size_t)-1) ? 1 : pa;
#else
        cc=(*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash: output just one of them
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte it stands for
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0')*0100 +
                                 (psz[1] - L'0')*010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // cnt is the number of continuation bytes needed for cc
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }

            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the lead byte has cnt + 1 high bits set followed by
                    // the topmost bits of cc (-128 >> cnt relies on the
                    // shift of a negative value being an arithmetic one)
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // and each continuation byte carries 6 more bits
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len<n))
        *buf = 0;

    return len;
}
902
903 // ----------------------------------------------------------------------------
904 // UTF-16
905 // ----------------------------------------------------------------------------
906
// The functions below are written once for the UTF-16 variant which is
// copied unit by unit ("straight") and once for the variant whose bytes are
// exchanged ("swap"): these macros map them to the big and little endian
// classes depending on whether WORDS_BIGENDIAN is defined.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF16straight wxMBConvUTF16BE
#define wxMBConvUTF16swap wxMBConvUTF16LE
#else
#define wxMBConvUTF16swap wxMBConvUTF16BE
#define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
914
915
916 #ifdef WC_UTF16
917
918 // copy 16bit MB to 16bit String
919 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
920 {
921 size_t len=0;
922
923 while (*(wxUint16*)psz && (!buf || len < n))
924 {
925 if (buf)
926 *buf++ = *(wxUint16*)psz;
927 len++;
928
929 psz += sizeof(wxUint16);
930 }
931 if (buf && len<n) *buf=0;
932
933 return len;
934 }
935
936
937 // copy 16bit String to 16bit MB
938 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
939 {
940 size_t len=0;
941
942 while (*psz && (!buf || len < n))
943 {
944 if (buf)
945 {
946 *(wxUint16*)buf = *psz;
947 buf += sizeof(wxUint16);
948 }
949 len += sizeof(wxUint16);
950 psz++;
951 }
952 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
953
954 return len;
955 }
956
957
958 // swap 16bit MB to 16bit String
959 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
960 {
961 size_t len = 0;
962
963 // UTF16 string must be terminated by 2 NULs as single NULs may occur
964 // inside the string
965 while ( (psz[0] || psz[1]) && (!buf || len < n) )
966 {
967 if ( buf )
968 {
969 ((char *)buf)[0] = psz[1];
970 ((char *)buf)[1] = psz[0];
971 buf++;
972 }
973 len++;
974 psz += 2;
975 }
976
977 if ( buf && len < n )
978 *buf = L'\0';
979
980 return len;
981 }
982
983
984 // swap 16bit MB to 16bit String
985 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
986 {
987 size_t len = 0;
988
989 while ( *psz && (!buf || len < n) )
990 {
991 if ( buf )
992 {
993 *buf++ = ((char*)psz)[1];
994 *buf++ = ((char*)psz)[0];
995 }
996 len += 2;
997 psz++;
998 }
999
1000 if ( buf && len < n )
1001 *buf = '\0';
1002
1003 return len;
1004 }
1005
1006
1007 #else // WC_UTF16
1008
1009
1010 // copy 16bit MB to 32bit String
1011 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1012 {
1013 size_t len=0;
1014
1015 while (*(wxUint16*)psz && (!buf || len < n))
1016 {
1017 wxUint32 cc;
1018 size_t pa=decode_utf16((wxUint16*)psz, cc);
1019 if (pa == (size_t)-1)
1020 return pa;
1021
1022 if (buf)
1023 *buf++ = (wchar_t)cc;
1024 len++;
1025 psz += pa * sizeof(wxUint16);
1026 }
1027 if (buf && len<n) *buf=0;
1028
1029 return len;
1030 }
1031
1032
1033 // copy 32bit String to 16bit MB
1034 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1035 {
1036 size_t len=0;
1037
1038 while (*psz && (!buf || len < n))
1039 {
1040 wxUint16 cc[2];
1041 size_t pa=encode_utf16(*psz, cc);
1042
1043 if (pa == (size_t)-1)
1044 return pa;
1045
1046 if (buf)
1047 {
1048 *(wxUint16*)buf = cc[0];
1049 buf += sizeof(wxUint16);
1050 if (pa > 1)
1051 {
1052 *(wxUint16*)buf = cc[1];
1053 buf += sizeof(wxUint16);
1054 }
1055 }
1056
1057 len += pa*sizeof(wxUint16);
1058 psz++;
1059 }
1060 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1061
1062 return len;
1063 }
1064
1065
1066 // swap 16bit MB to 32bit String
1067 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1068 {
1069 size_t len=0;
1070
1071 while (*(wxUint16*)psz && (!buf || len < n))
1072 {
1073 wxUint32 cc;
1074 char tmp[4];
1075 tmp[0]=psz[1]; tmp[1]=psz[0];
1076 tmp[2]=psz[3]; tmp[3]=psz[2];
1077
1078 size_t pa=decode_utf16((wxUint16*)tmp, cc);
1079 if (pa == (size_t)-1)
1080 return pa;
1081
1082 if (buf)
1083 *buf++ = (wchar_t)cc;
1084
1085 len++;
1086 psz += pa * sizeof(wxUint16);
1087 }
1088 if (buf && len<n) *buf=0;
1089
1090 return len;
1091 }
1092
1093
1094 // swap 32bit String to 16bit MB
1095 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1096 {
1097 size_t len=0;
1098
1099 while (*psz && (!buf || len < n))
1100 {
1101 wxUint16 cc[2];
1102 size_t pa=encode_utf16(*psz, cc);
1103
1104 if (pa == (size_t)-1)
1105 return pa;
1106
1107 if (buf)
1108 {
1109 *buf++ = ((char*)cc)[1];
1110 *buf++ = ((char*)cc)[0];
1111 if (pa > 1)
1112 {
1113 *buf++ = ((char*)cc)[3];
1114 *buf++ = ((char*)cc)[2];
1115 }
1116 }
1117
1118 len += pa*sizeof(wxUint16);
1119 psz++;
1120 }
1121 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1122
1123 return len;
1124 }
1125
1126 #endif // WC_UTF16
1127
1128
1129 // ----------------------------------------------------------------------------
1130 // UTF-32
1131 // ----------------------------------------------------------------------------
1132
// The functions below are written once for the UTF-32 variant which is
// copied as is ("straight") and once for the variant whose bytes are
// reversed ("swap"): these macros map them to the big and little endian
// classes depending on whether WORDS_BIGENDIAN is defined.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight wxMBConvUTF32BE
#define wxMBConvUTF32swap wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap wxMBConvUTF32BE
#define wxMBConvUTF32straight wxMBConvUTF32LE
#endif


// definitions of the global UTF-32 conversion objects
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1144
1145
1146 #ifdef WC_UTF16
1147
1148 // copy 32bit MB to 16bit String
1149 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1150 {
1151 size_t len=0;
1152
1153 while (*(wxUint32*)psz && (!buf || len < n))
1154 {
1155 wxUint16 cc[2];
1156
1157 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1158 if (pa == (size_t)-1)
1159 return pa;
1160
1161 if (buf)
1162 {
1163 *buf++ = cc[0];
1164 if (pa > 1)
1165 *buf++ = cc[1];
1166 }
1167 len += pa;
1168 psz += sizeof(wxUint32);
1169 }
1170 if (buf && len<n) *buf=0;
1171
1172 return len;
1173 }
1174
1175
1176 // copy 16bit String to 32bit MB
1177 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1178 {
1179 size_t len=0;
1180
1181 while (*psz && (!buf || len < n))
1182 {
1183 wxUint32 cc;
1184
1185 // cast is ok for WC_UTF16
1186 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1187 if (pa == (size_t)-1)
1188 return pa;
1189
1190 if (buf)
1191 {
1192 *(wxUint32*)buf = cc;
1193 buf += sizeof(wxUint32);
1194 }
1195 len += sizeof(wxUint32);
1196 psz += pa;
1197 }
1198
1199 if (buf && len<=n-sizeof(wxUint32))
1200 *(wxUint32*)buf=0;
1201
1202 return len;
1203 }
1204
1205
1206
1207 // swap 32bit MB to 16bit String
1208 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1209 {
1210 size_t len=0;
1211
1212 while (*(wxUint32*)psz && (!buf || len < n))
1213 {
1214 char tmp[4];
1215 tmp[0] = psz[3]; tmp[1] = psz[2];
1216 tmp[2] = psz[1]; tmp[3] = psz[0];
1217
1218
1219 wxUint16 cc[2];
1220
1221 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1222 if (pa == (size_t)-1)
1223 return pa;
1224
1225 if (buf)
1226 {
1227 *buf++ = cc[0];
1228 if (pa > 1)
1229 *buf++ = cc[1];
1230 }
1231 len += pa;
1232 psz += sizeof(wxUint32);
1233 }
1234
1235 if (buf && len<n)
1236 *buf=0;
1237
1238 return len;
1239 }
1240
1241
1242 // swap 16bit String to 32bit MB
1243 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1244 {
1245 size_t len=0;
1246
1247 while (*psz && (!buf || len < n))
1248 {
1249 char cc[4];
1250
1251 // cast is ok for WC_UTF16
1252 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1253 if (pa == (size_t)-1)
1254 return pa;
1255
1256 if (buf)
1257 {
1258 *buf++ = cc[3];
1259 *buf++ = cc[2];
1260 *buf++ = cc[1];
1261 *buf++ = cc[0];
1262 }
1263 len += sizeof(wxUint32);
1264 psz += pa;
1265 }
1266
1267 if (buf && len<=n-sizeof(wxUint32))
1268 *(wxUint32*)buf=0;
1269
1270 return len;
1271 }
1272
1273 #else // WC_UTF16
1274
1275
1276 // copy 32bit MB to 32bit String
1277 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1278 {
1279 size_t len=0;
1280
1281 while (*(wxUint32*)psz && (!buf || len < n))
1282 {
1283 if (buf)
1284 *buf++ = (wchar_t)(*(wxUint32*)psz);
1285 len++;
1286 psz += sizeof(wxUint32);
1287 }
1288
1289 if (buf && len<n)
1290 *buf=0;
1291
1292 return len;
1293 }
1294
1295
1296 // copy 32bit String to 32bit MB
1297 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1298 {
1299 size_t len=0;
1300
1301 while (*psz && (!buf || len < n))
1302 {
1303 if (buf)
1304 {
1305 *(wxUint32*)buf = *psz;
1306 buf += sizeof(wxUint32);
1307 }
1308
1309 len += sizeof(wxUint32);
1310 psz++;
1311 }
1312
1313 if (buf && len<=n-sizeof(wxUint32))
1314 *(wxUint32*)buf=0;
1315
1316 return len;
1317 }
1318
1319
1320 // swap 32bit MB to 32bit String
1321 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1322 {
1323 size_t len=0;
1324
1325 while (*(wxUint32*)psz && (!buf || len < n))
1326 {
1327 if (buf)
1328 {
1329 ((char *)buf)[0] = psz[3];
1330 ((char *)buf)[1] = psz[2];
1331 ((char *)buf)[2] = psz[1];
1332 ((char *)buf)[3] = psz[0];
1333 buf++;
1334 }
1335 len++;
1336 psz += sizeof(wxUint32);
1337 }
1338
1339 if (buf && len<n)
1340 *buf=0;
1341
1342 return len;
1343 }
1344
1345
1346 // swap 32bit String to 32bit MB
1347 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1348 {
1349 size_t len=0;
1350
1351 while (*psz && (!buf || len < n))
1352 {
1353 if (buf)
1354 {
1355 *buf++ = ((char *)psz)[3];
1356 *buf++ = ((char *)psz)[2];
1357 *buf++ = ((char *)psz)[1];
1358 *buf++ = ((char *)psz)[0];
1359 }
1360 len += sizeof(wxUint32);
1361 psz++;
1362 }
1363
1364 if (buf && len<=n-sizeof(wxUint32))
1365 *(wxUint32*)buf=0;
1366
1367 return len;
1368 }
1369
1370
1371 #endif // WC_UTF16
1372
1373
1374 // ============================================================================
1375 // The classes doing conversion using the iconv_xxx() functions
1376 // ============================================================================
1377
1378 #ifdef HAVE_ICONV
1379
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if the output buffer is _exactly_ as big as needed. This is
//     (unless there's yet another bug in glibc) the only case in which
//     iconv() returns (size_t)-1 (which means error) while reporting 0 bytes
//     left in the input buffer -- when a _real_ error occurs, the number of
//     bytes left in the input buffer is non-zero. Hence this alternative
//     test for iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// NB: the macro arguments are parenthesized so that passing an expression
//     (and not just a plain variable) can't change the meaning of the test
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type expected by iconv()
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// the value returned by iconv_open() on failure
#define ICONV_T_INVALID ((iconv_t)-1)
1398
1399 #if SIZEOF_WCHAR_T == 4
1400 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1401 #define WC_ENC wxFONTENCODING_UTF32
1402 #elif SIZEOF_WCHAR_T == 2
1403 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1404 #define WC_ENC wxFONTENCODING_UTF16
1405 #else // sizeof(wchar_t) != 2 nor 4
1406 // does this ever happen?
1407 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1408 #endif
1409
1410 // ----------------------------------------------------------------------------
1411 // wxMBConv_iconv: encapsulates an iconv character set
1412 // ----------------------------------------------------------------------------
1413
1414 class wxMBConv_iconv : public wxMBConv
1415 {
1416 public:
1417 wxMBConv_iconv(const wxChar *name);
1418 virtual ~wxMBConv_iconv();
1419
1420 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1421 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1422
1423 // classify this encoding as explained in wxMBConv::GetMBNulLen()
1424 // comment
1425 virtual size_t GetMBNulLen() const;
1426
1427 bool IsOk() const
1428 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1429
1430 protected:
1431 // the iconv handlers used to translate from multibyte to wide char and in
1432 // the other direction
1433 iconv_t m2w,
1434 w2m;
1435 #if wxUSE_THREADS
1436 // guards access to m2w and w2m objects
1437 wxMutex m_iconvMutex;
1438 #endif
1439
1440 private:
1441 // the name (for iconv_open()) of a wide char charset -- if none is
1442 // available on this machine, it will remain NULL
1443 static wxString ms_wcCharsetName;
1444
1445 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1446 // different endian-ness than the native one
1447 static bool ms_wcNeedsSwap;
1448
1449 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1450 // initially
1451 size_t m_minMBCharWidth;
1452 };
1453
1454 // make the constructor available for unit testing
1455 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1456 {
1457 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1458 if ( !result->IsOk() )
1459 {
1460 delete result;
1461 return 0;
1462 }
1463 return result;
1464 }
1465
1466 wxString wxMBConv_iconv::ms_wcCharsetName;
1467 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1468
1469 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1470 {
1471 m_minMBCharWidth = 0;
1472
1473 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1474 // names for the charsets
1475 const wxCharBuffer cname(wxString(name).ToAscii());
1476
1477 // check for charset that represents wchar_t:
1478 if ( ms_wcCharsetName.empty() )
1479 {
1480 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1481
1482 #if wxUSE_FONTMAP
1483 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1484 #else // !wxUSE_FONTMAP
1485 static const wxChar *names[] =
1486 {
1487 #if SIZEOF_WCHAR_T == 4
1488 _T("UCS-4"),
1489 #elif SIZEOF_WCHAR_T = 2
1490 _T("UCS-2"),
1491 #endif
1492 NULL
1493 };
1494 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1495
1496 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1497 {
1498 const wxString nameCS(*names);
1499
1500 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1501 wxString nameXE(nameCS);
1502 #ifdef WORDS_BIGENDIAN
1503 nameXE += _T("BE");
1504 #else // little endian
1505 nameXE += _T("LE");
1506 #endif
1507
1508 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1509 nameXE.c_str());
1510
1511 m2w = iconv_open(nameXE.ToAscii(), cname);
1512 if ( m2w == ICONV_T_INVALID )
1513 {
1514 // try charset w/o bytesex info (e.g. "UCS4")
1515 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1516 nameCS.c_str());
1517 m2w = iconv_open(nameCS.ToAscii(), cname);
1518
1519 // and check for bytesex ourselves:
1520 if ( m2w != ICONV_T_INVALID )
1521 {
1522 char buf[2], *bufPtr;
1523 wchar_t wbuf[2], *wbufPtr;
1524 size_t insz, outsz;
1525 size_t res;
1526
1527 buf[0] = 'A';
1528 buf[1] = 0;
1529 wbuf[0] = 0;
1530 insz = 2;
1531 outsz = SIZEOF_WCHAR_T * 2;
1532 wbufPtr = wbuf;
1533 bufPtr = buf;
1534
1535 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1536 (char**)&wbufPtr, &outsz);
1537
1538 if (ICONV_FAILED(res, insz))
1539 {
1540 wxLogLastError(wxT("iconv"));
1541 wxLogError(_("Conversion to charset '%s' doesn't work."),
1542 nameCS.c_str());
1543 }
1544 else // ok, can convert to this encoding, remember it
1545 {
1546 ms_wcCharsetName = nameCS;
1547 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1548 }
1549 }
1550 }
1551 else // use charset not requiring byte swapping
1552 {
1553 ms_wcCharsetName = nameXE;
1554 }
1555 }
1556
1557 wxLogTrace(TRACE_STRCONV,
1558 wxT("iconv wchar_t charset is \"%s\"%s"),
1559 ms_wcCharsetName.empty() ? _T("<none>")
1560 : ms_wcCharsetName.c_str(),
1561 ms_wcNeedsSwap ? _T(" (needs swap)")
1562 : _T(""));
1563 }
1564 else // we already have ms_wcCharsetName
1565 {
1566 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1567 }
1568
1569 if ( ms_wcCharsetName.empty() )
1570 {
1571 w2m = ICONV_T_INVALID;
1572 }
1573 else
1574 {
1575 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1576 if ( w2m == ICONV_T_INVALID )
1577 {
1578 wxLogTrace(TRACE_STRCONV,
1579 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1580 ms_wcCharsetName.c_str(), cname.data());
1581 }
1582 }
1583 }
1584
1585 wxMBConv_iconv::~wxMBConv_iconv()
1586 {
1587 if ( m2w != ICONV_T_INVALID )
1588 iconv_close(m2w);
1589 if ( w2m != ICONV_T_INVALID )
1590 iconv_close(w2m);
1591 }
1592
1593 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1594 {
1595 // find the string length: notice that must be done differently for
1596 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1597 size_t inbuf;
1598 const size_t nulLen = GetMBNulLen();
1599 switch ( nulLen )
1600 {
1601 default:
1602 return (size_t)-1;
1603
1604 case 1:
1605 inbuf = strlen(psz); // arguably more optimized than our version
1606 break;
1607
1608 case 2:
1609 case 4:
1610 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1611 // they also have to start at character boundary and not span two
1612 // adjacent characters
1613 const char *p;
1614 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1615 ;
1616 inbuf = p - psz;
1617 break;
1618 }
1619
1620 #if wxUSE_THREADS
1621 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1622 // Unfortunately there is a couple of global wxCSConv objects such as
1623 // wxConvLocal that are used all over wx code, so we have to make sure
1624 // the handle is used by at most one thread at the time. Otherwise
1625 // only a few wx classes would be safe to use from non-main threads
1626 // as MB<->WC conversion would fail "randomly".
1627 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1628 #endif // wxUSE_THREADS
1629
1630
1631 size_t outbuf = n * SIZEOF_WCHAR_T;
1632 size_t res, cres;
1633 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1634 wchar_t *bufPtr = buf;
1635 const char *pszPtr = psz;
1636
1637 if (buf)
1638 {
1639 // have destination buffer, convert there
1640 cres = iconv(m2w,
1641 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1642 (char**)&bufPtr, &outbuf);
1643 res = n - (outbuf / SIZEOF_WCHAR_T);
1644
1645 if (ms_wcNeedsSwap)
1646 {
1647 // convert to native endianness
1648 for ( unsigned i = 0; i < res; i++ )
1649 buf[n] = WC_BSWAP(buf[i]);
1650 }
1651
1652 // NUL-terminate the string if there is any space left
1653 if (res < n)
1654 buf[res] = 0;
1655 }
1656 else
1657 {
1658 // no destination buffer... convert using temp buffer
1659 // to calculate destination buffer requirement
1660 wchar_t tbuf[8];
1661 res = 0;
1662 do {
1663 bufPtr = tbuf;
1664 outbuf = 8*SIZEOF_WCHAR_T;
1665
1666 cres = iconv(m2w,
1667 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1668 (char**)&bufPtr, &outbuf );
1669
1670 res += 8-(outbuf/SIZEOF_WCHAR_T);
1671 } while ((cres==(size_t)-1) && (errno==E2BIG));
1672 }
1673
1674 if (ICONV_FAILED(cres, inbuf))
1675 {
1676 //VS: it is ok if iconv fails, hence trace only
1677 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1678 return (size_t)-1;
1679 }
1680
1681 return res;
1682 }
1683
1684 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1685 {
1686 #if wxUSE_THREADS
1687 // NB: explained in MB2WC
1688 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1689 #endif
1690
1691 size_t inlen = wxWcslen(psz);
1692 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1693 size_t outbuf = n;
1694 size_t res, cres;
1695
1696 wchar_t *tmpbuf = 0;
1697
1698 if (ms_wcNeedsSwap)
1699 {
1700 // need to copy to temp buffer to switch endianness
1701 // (doing WC_BSWAP twice on the original buffer won't help, as it
1702 // could be in read-only memory, or be accessed in some other thread)
1703 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1704 for ( size_t i = 0; i < inlen; i++ )
1705 tmpbuf[n] = WC_BSWAP(psz[i]);
1706 tmpbuf[inlen] = L'\0';
1707 psz = tmpbuf;
1708 }
1709
1710 if (buf)
1711 {
1712 // have destination buffer, convert there
1713 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1714
1715 res = n-outbuf;
1716
1717 // NB: iconv was given only wcslen(psz) characters on input, and so
1718 // it couldn't convert the trailing zero. Let's do it ourselves
1719 // if there's some room left for it in the output buffer.
1720 if (res < n)
1721 buf[0] = 0;
1722 }
1723 else
1724 {
1725 // no destination buffer... convert using temp buffer
1726 // to calculate destination buffer requirement
1727 char tbuf[16];
1728 res = 0;
1729 do {
1730 buf = tbuf; outbuf = 16;
1731
1732 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1733
1734 res += 16 - outbuf;
1735 } while ((cres==(size_t)-1) && (errno==E2BIG));
1736 }
1737
1738 if (ms_wcNeedsSwap)
1739 {
1740 free(tmpbuf);
1741 }
1742
1743 if (ICONV_FAILED(cres, inbuf))
1744 {
1745 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1746 return (size_t)-1;
1747 }
1748
1749 return res;
1750 }
1751
1752 size_t wxMBConv_iconv::GetMBNulLen() const
1753 {
1754 if ( m_minMBCharWidth == 0 )
1755 {
1756 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1757
1758 #if wxUSE_THREADS
1759 // NB: explained in MB2WC
1760 wxMutexLocker lock(self->m_iconvMutex);
1761 #endif
1762
1763 wchar_t *wnul = L"";
1764 char buf[8]; // should be enough for NUL in any encoding
1765 size_t inLen = sizeof(wchar_t),
1766 outLen = WXSIZEOF(buf);
1767 char *in = (char *)wnul;
1768 char *out = buf;
1769 if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1770 {
1771 self->m_minMBCharWidth = (size_t)-1;
1772 }
1773 else // ok
1774 {
1775 self->m_minMBCharWidth = out - buf;
1776 }
1777 }
1778
1779 return m_minMBCharWidth;
1780 }
1781
1782 #endif // HAVE_ICONV
1783
1784
1785 // ============================================================================
1786 // Win32 conversion classes
1787 // ============================================================================
1788
1789 #ifdef wxHAVE_WIN32_MB2WC
1790
1791 // from utils.cpp
1792 #if wxUSE_FONTMAP
1793 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1794 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1795 #endif
1796
1797 class wxMBConv_win32 : public wxMBConv
1798 {
1799 public:
1800 wxMBConv_win32()
1801 {
1802 m_CodePage = CP_ACP;
1803 m_minMBCharWidth = 0;
1804 }
1805
1806 #if wxUSE_FONTMAP
1807 wxMBConv_win32(const wxChar* name)
1808 {
1809 m_CodePage = wxCharsetToCodepage(name);
1810 m_minMBCharWidth = 0;
1811 }
1812
1813 wxMBConv_win32(wxFontEncoding encoding)
1814 {
1815 m_CodePage = wxEncodingToCodepage(encoding);
1816 m_minMBCharWidth = 0;
1817 }
1818 #endif // wxUSE_FONTMAP
1819
1820 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1821 {
1822 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1823 // the behaviour is not compatible with the Unix version (using iconv)
1824 // and break the library itself, e.g. wxTextInputStream::NextChar()
1825 // wouldn't work if reading an incomplete MB char didn't result in an
1826 // error
1827 //
1828 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1829 // an error (tested under Windows Server 2003) and apparently it is
1830 // done on purpose, i.e. the function accepts any input in this case
1831 // and although I'd prefer to return error on ill-formed output, our
1832 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1833 // explicitly ill-formed according to RFC 2152) neither so we don't
1834 // even have any fallback here...
1835 //
1836 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1837 // Win XP or newer and if it is specified on older versions, conversion
1838 // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1839 // fails. So we can only use the flag on newer Windows versions.
1840 // Additionally, the flag is not supported by UTF7, symbol and CJK
1841 // encodings. See here:
1842 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1843 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1844 int flags = 0;
1845 if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1846 m_CodePage < 50000 &&
1847 IsAtLeastWin2kSP4() )
1848 {
1849 flags = MB_ERR_INVALID_CHARS;
1850 }
1851 else if ( m_CodePage == CP_UTF8 )
1852 {
1853 // Avoid round-trip in the special case of UTF-8 by using our
1854 // own UTF-8 conversion code:
1855 return wxMBConvUTF8().MB2WC(buf, psz, n);
1856 }
1857
1858 const size_t len = ::MultiByteToWideChar
1859 (
1860 m_CodePage, // code page
1861 flags, // flags: fall on error
1862 psz, // input string
1863 -1, // its length (NUL-terminated)
1864 buf, // output string
1865 buf ? n : 0 // size of output buffer
1866 );
1867 if ( !len )
1868 {
1869 // function totally failed
1870 return (size_t)-1;
1871 }
1872
1873 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1874 // check if we succeeded, by doing a double trip:
1875 if ( !flags && buf )
1876 {
1877 const size_t mbLen = strlen(psz);
1878 wxCharBuffer mbBuf(mbLen);
1879 if ( ::WideCharToMultiByte
1880 (
1881 m_CodePage,
1882 0,
1883 buf,
1884 -1,
1885 mbBuf.data(),
1886 mbLen + 1, // size in bytes, not length
1887 NULL,
1888 NULL
1889 ) == 0 ||
1890 strcmp(mbBuf, psz) != 0 )
1891 {
1892 // we didn't obtain the same thing we started from, hence
1893 // the conversion was lossy and we consider that it failed
1894 return (size_t)-1;
1895 }
1896 }
1897
1898 // note that it returns count of written chars for buf != NULL and size
1899 // of the needed buffer for buf == NULL so in either case the length of
1900 // the string (which never includes the terminating NUL) is one less
1901 return len - 1;
1902 }
1903
1904 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1905 {
1906 /*
1907 we have a problem here: by default, WideCharToMultiByte() may
1908 replace characters unrepresentable in the target code page with bad
1909 quality approximations such as turning "1/2" symbol (U+00BD) into
1910 "1" for the code pages which don't have it and we, obviously, want
1911 to avoid this at any price
1912
1913 the trouble is that this function does it _silently_, i.e. it won't
1914 even tell us whether it did or not... Win98/2000 and higher provide
1915 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1916 we have to resort to a round trip, i.e. check that converting back
1917 results in the same string -- this is, of course, expensive but
1918 otherwise we simply can't be sure to not garble the data.
1919 */
1920
1921 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1922 // it doesn't work with CJK encodings (which we test for rather roughly
1923 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1924 // supporting it
1925 BOOL usedDef wxDUMMY_INITIALIZE(false);
1926 BOOL *pUsedDef;
1927 int flags;
1928 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1929 {
1930 // it's our lucky day
1931 flags = WC_NO_BEST_FIT_CHARS;
1932 pUsedDef = &usedDef;
1933 }
1934 else // old system or unsupported encoding
1935 {
1936 flags = 0;
1937 pUsedDef = NULL;
1938 }
1939
1940 const size_t len = ::WideCharToMultiByte
1941 (
1942 m_CodePage, // code page
1943 flags, // either none or no best fit
1944 pwz, // input string
1945 -1, // it is (wide) NUL-terminated
1946 buf, // output buffer
1947 buf ? n : 0, // and its size
1948 NULL, // default "replacement" char
1949 pUsedDef // [out] was it used?
1950 );
1951
1952 if ( !len )
1953 {
1954 // function totally failed
1955 return (size_t)-1;
1956 }
1957
1958 // if we were really converting, check if we succeeded
1959 if ( buf )
1960 {
1961 if ( flags )
1962 {
1963 // check if the conversion failed, i.e. if any replacements
1964 // were done
1965 if ( usedDef )
1966 return (size_t)-1;
1967 }
1968 else // we must resort to double tripping...
1969 {
1970 wxWCharBuffer wcBuf(n);
1971 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1972 wcscmp(wcBuf, pwz) != 0 )
1973 {
1974 // we didn't obtain the same thing we started from, hence
1975 // the conversion was lossy and we consider that it failed
1976 return (size_t)-1;
1977 }
1978 }
1979 }
1980
1981 // see the comment above for the reason of "len - 1"
1982 return len - 1;
1983 }
1984
1985 virtual size_t GetMBNulLen() const
1986 {
1987 if ( m_minMBCharWidth == 0 )
1988 {
1989 int len = ::WideCharToMultiByte
1990 (
1991 m_CodePage, // code page
1992 0, // no flags
1993 L"", // input string
1994 1, // translate just the NUL
1995 NULL, // output buffer
1996 0, // and its size
1997 NULL, // no replacement char
1998 NULL // [out] don't care if it was used
1999 );
2000
2001 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2002 switch ( len )
2003 {
2004 default:
2005 wxLogDebug(_T("Unexpected NUL length %d"), len);
2006 // fall through
2007
2008 case 0:
2009 self->m_minMBCharWidth = (size_t)-1;
2010 break;
2011
2012 case 1:
2013 case 2:
2014 case 4:
2015 self->m_minMBCharWidth = len;
2016 break;
2017 }
2018 }
2019
2020 return m_minMBCharWidth;
2021 }
2022
2023 bool IsOk() const { return m_CodePage != -1; }
2024
2025 private:
2026 static bool CanUseNoBestFit()
2027 {
2028 static int s_isWin98Or2k = -1;
2029
2030 if ( s_isWin98Or2k == -1 )
2031 {
2032 int verMaj, verMin;
2033 switch ( wxGetOsVersion(&verMaj, &verMin) )
2034 {
2035 case wxWIN95:
2036 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2037 break;
2038
2039 case wxWINDOWS_NT:
2040 s_isWin98Or2k = verMaj >= 5;
2041 break;
2042
2043 default:
2044 // unknown, be conseravtive by default
2045 s_isWin98Or2k = 0;
2046 }
2047
2048 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2049 }
2050
2051 return s_isWin98Or2k == 1;
2052 }
2053
2054 static bool IsAtLeastWin2kSP4()
2055 {
2056 #ifdef __WXWINCE__
2057 return false;
2058 #else
2059 static int s_isAtLeastWin2kSP4 = -1;
2060
2061 if ( s_isAtLeastWin2kSP4 == -1 )
2062 {
2063 OSVERSIONINFOEX ver;
2064
2065 memset(&ver, 0, sizeof(ver));
2066 ver.dwOSVersionInfoSize = sizeof(ver);
2067 GetVersionEx((OSVERSIONINFO*)&ver);
2068
2069 s_isAtLeastWin2kSP4 =
2070 ((ver.dwMajorVersion > 5) || // Vista+
2071 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2072 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2073 ver.wServicePackMajor >= 4)) // 2000 SP4+
2074 ? 1 : 0;
2075 }
2076
2077 return s_isAtLeastWin2kSP4 == 1;
2078 #endif
2079 }
2080
2081
2082 // the code page we're working with
2083 long m_CodePage;
2084
2085 // cached result of GetMBNulLen(), set to 0 initially meaning
2086 // "unknown"
2087 size_t m_minMBCharWidth;
2088 };
2089
2090 #endif // wxHAVE_WIN32_MB2WC
2091
2092 // ============================================================================
2093 // Cocoa conversion classes
2094 // ============================================================================
2095
2096 #if defined(__WXCOCOA__)
2097
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a bit -
// it's just not public (yet).
2101
2102 #include <CoreFoundation/CFString.h>
2103 #include <CoreFoundation/CFStringEncodingExt.h>
2104
2105 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2106 {
2107 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2108 if ( encoding == wxFONTENCODING_DEFAULT )
2109 {
2110 enc = CFStringGetSystemEncoding();
2111 }
2112 else switch( encoding)
2113 {
2114 case wxFONTENCODING_ISO8859_1 :
2115 enc = kCFStringEncodingISOLatin1 ;
2116 break ;
2117 case wxFONTENCODING_ISO8859_2 :
2118 enc = kCFStringEncodingISOLatin2;
2119 break ;
2120 case wxFONTENCODING_ISO8859_3 :
2121 enc = kCFStringEncodingISOLatin3 ;
2122 break ;
2123 case wxFONTENCODING_ISO8859_4 :
2124 enc = kCFStringEncodingISOLatin4;
2125 break ;
2126 case wxFONTENCODING_ISO8859_5 :
2127 enc = kCFStringEncodingISOLatinCyrillic;
2128 break ;
2129 case wxFONTENCODING_ISO8859_6 :
2130 enc = kCFStringEncodingISOLatinArabic;
2131 break ;
2132 case wxFONTENCODING_ISO8859_7 :
2133 enc = kCFStringEncodingISOLatinGreek;
2134 break ;
2135 case wxFONTENCODING_ISO8859_8 :
2136 enc = kCFStringEncodingISOLatinHebrew;
2137 break ;
2138 case wxFONTENCODING_ISO8859_9 :
2139 enc = kCFStringEncodingISOLatin5;
2140 break ;
2141 case wxFONTENCODING_ISO8859_10 :
2142 enc = kCFStringEncodingISOLatin6;
2143 break ;
2144 case wxFONTENCODING_ISO8859_11 :
2145 enc = kCFStringEncodingISOLatinThai;
2146 break ;
2147 case wxFONTENCODING_ISO8859_13 :
2148 enc = kCFStringEncodingISOLatin7;
2149 break ;
2150 case wxFONTENCODING_ISO8859_14 :
2151 enc = kCFStringEncodingISOLatin8;
2152 break ;
2153 case wxFONTENCODING_ISO8859_15 :
2154 enc = kCFStringEncodingISOLatin9;
2155 break ;
2156
2157 case wxFONTENCODING_KOI8 :
2158 enc = kCFStringEncodingKOI8_R;
2159 break ;
2160 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2161 enc = kCFStringEncodingDOSRussian;
2162 break ;
2163
2164 // case wxFONTENCODING_BULGARIAN :
2165 // enc = ;
2166 // break ;
2167
2168 case wxFONTENCODING_CP437 :
2169 enc =kCFStringEncodingDOSLatinUS ;
2170 break ;
2171 case wxFONTENCODING_CP850 :
2172 enc = kCFStringEncodingDOSLatin1;
2173 break ;
2174 case wxFONTENCODING_CP852 :
2175 enc = kCFStringEncodingDOSLatin2;
2176 break ;
2177 case wxFONTENCODING_CP855 :
2178 enc = kCFStringEncodingDOSCyrillic;
2179 break ;
2180 case wxFONTENCODING_CP866 :
2181 enc =kCFStringEncodingDOSRussian ;
2182 break ;
2183 case wxFONTENCODING_CP874 :
2184 enc = kCFStringEncodingDOSThai;
2185 break ;
2186 case wxFONTENCODING_CP932 :
2187 enc = kCFStringEncodingDOSJapanese;
2188 break ;
2189 case wxFONTENCODING_CP936 :
2190 enc =kCFStringEncodingDOSChineseSimplif ;
2191 break ;
2192 case wxFONTENCODING_CP949 :
2193 enc = kCFStringEncodingDOSKorean;
2194 break ;
2195 case wxFONTENCODING_CP950 :
2196 enc = kCFStringEncodingDOSChineseTrad;
2197 break ;
2198 case wxFONTENCODING_CP1250 :
2199 enc = kCFStringEncodingWindowsLatin2;
2200 break ;
2201 case wxFONTENCODING_CP1251 :
2202 enc =kCFStringEncodingWindowsCyrillic ;
2203 break ;
2204 case wxFONTENCODING_CP1252 :
2205 enc =kCFStringEncodingWindowsLatin1 ;
2206 break ;
2207 case wxFONTENCODING_CP1253 :
2208 enc = kCFStringEncodingWindowsGreek;
2209 break ;
2210 case wxFONTENCODING_CP1254 :
2211 enc = kCFStringEncodingWindowsLatin5;
2212 break ;
2213 case wxFONTENCODING_CP1255 :
2214 enc =kCFStringEncodingWindowsHebrew ;
2215 break ;
2216 case wxFONTENCODING_CP1256 :
2217 enc =kCFStringEncodingWindowsArabic ;
2218 break ;
2219 case wxFONTENCODING_CP1257 :
2220 enc = kCFStringEncodingWindowsBalticRim;
2221 break ;
2222 // This only really encodes to UTF7 (if that) evidently
2223 // case wxFONTENCODING_UTF7 :
2224 // enc = kCFStringEncodingNonLossyASCII ;
2225 // break ;
2226 case wxFONTENCODING_UTF8 :
2227 enc = kCFStringEncodingUTF8 ;
2228 break ;
2229 case wxFONTENCODING_EUC_JP :
2230 enc = kCFStringEncodingEUC_JP;
2231 break ;
2232 case wxFONTENCODING_UTF16 :
2233 enc = kCFStringEncodingUnicode ;
2234 break ;
2235 case wxFONTENCODING_MACROMAN :
2236 enc = kCFStringEncodingMacRoman ;
2237 break ;
2238 case wxFONTENCODING_MACJAPANESE :
2239 enc = kCFStringEncodingMacJapanese ;
2240 break ;
2241 case wxFONTENCODING_MACCHINESETRAD :
2242 enc = kCFStringEncodingMacChineseTrad ;
2243 break ;
2244 case wxFONTENCODING_MACKOREAN :
2245 enc = kCFStringEncodingMacKorean ;
2246 break ;
2247 case wxFONTENCODING_MACARABIC :
2248 enc = kCFStringEncodingMacArabic ;
2249 break ;
2250 case wxFONTENCODING_MACHEBREW :
2251 enc = kCFStringEncodingMacHebrew ;
2252 break ;
2253 case wxFONTENCODING_MACGREEK :
2254 enc = kCFStringEncodingMacGreek ;
2255 break ;
2256 case wxFONTENCODING_MACCYRILLIC :
2257 enc = kCFStringEncodingMacCyrillic ;
2258 break ;
2259 case wxFONTENCODING_MACDEVANAGARI :
2260 enc = kCFStringEncodingMacDevanagari ;
2261 break ;
2262 case wxFONTENCODING_MACGURMUKHI :
2263 enc = kCFStringEncodingMacGurmukhi ;
2264 break ;
2265 case wxFONTENCODING_MACGUJARATI :
2266 enc = kCFStringEncodingMacGujarati ;
2267 break ;
2268 case wxFONTENCODING_MACORIYA :
2269 enc = kCFStringEncodingMacOriya ;
2270 break ;
2271 case wxFONTENCODING_MACBENGALI :
2272 enc = kCFStringEncodingMacBengali ;
2273 break ;
2274 case wxFONTENCODING_MACTAMIL :
2275 enc = kCFStringEncodingMacTamil ;
2276 break ;
2277 case wxFONTENCODING_MACTELUGU :
2278 enc = kCFStringEncodingMacTelugu ;
2279 break ;
2280 case wxFONTENCODING_MACKANNADA :
2281 enc = kCFStringEncodingMacKannada ;
2282 break ;
2283 case wxFONTENCODING_MACMALAJALAM :
2284 enc = kCFStringEncodingMacMalayalam ;
2285 break ;
2286 case wxFONTENCODING_MACSINHALESE :
2287 enc = kCFStringEncodingMacSinhalese ;
2288 break ;
2289 case wxFONTENCODING_MACBURMESE :
2290 enc = kCFStringEncodingMacBurmese ;
2291 break ;
2292 case wxFONTENCODING_MACKHMER :
2293 enc = kCFStringEncodingMacKhmer ;
2294 break ;
2295 case wxFONTENCODING_MACTHAI :
2296 enc = kCFStringEncodingMacThai ;
2297 break ;
2298 case wxFONTENCODING_MACLAOTIAN :
2299 enc = kCFStringEncodingMacLaotian ;
2300 break ;
2301 case wxFONTENCODING_MACGEORGIAN :
2302 enc = kCFStringEncodingMacGeorgian ;
2303 break ;
2304 case wxFONTENCODING_MACARMENIAN :
2305 enc = kCFStringEncodingMacArmenian ;
2306 break ;
2307 case wxFONTENCODING_MACCHINESESIMP :
2308 enc = kCFStringEncodingMacChineseSimp ;
2309 break ;
2310 case wxFONTENCODING_MACTIBETAN :
2311 enc = kCFStringEncodingMacTibetan ;
2312 break ;
2313 case wxFONTENCODING_MACMONGOLIAN :
2314 enc = kCFStringEncodingMacMongolian ;
2315 break ;
2316 case wxFONTENCODING_MACETHIOPIC :
2317 enc = kCFStringEncodingMacEthiopic ;
2318 break ;
2319 case wxFONTENCODING_MACCENTRALEUR :
2320 enc = kCFStringEncodingMacCentralEurRoman ;
2321 break ;
2322 case wxFONTENCODING_MACVIATNAMESE :
2323 enc = kCFStringEncodingMacVietnamese ;
2324 break ;
2325 case wxFONTENCODING_MACARABICEXT :
2326 enc = kCFStringEncodingMacExtArabic ;
2327 break ;
2328 case wxFONTENCODING_MACSYMBOL :
2329 enc = kCFStringEncodingMacSymbol ;
2330 break ;
2331 case wxFONTENCODING_MACDINGBATS :
2332 enc = kCFStringEncodingMacDingbats ;
2333 break ;
2334 case wxFONTENCODING_MACTURKISH :
2335 enc = kCFStringEncodingMacTurkish ;
2336 break ;
2337 case wxFONTENCODING_MACCROATIAN :
2338 enc = kCFStringEncodingMacCroatian ;
2339 break ;
2340 case wxFONTENCODING_MACICELANDIC :
2341 enc = kCFStringEncodingMacIcelandic ;
2342 break ;
2343 case wxFONTENCODING_MACROMANIAN :
2344 enc = kCFStringEncodingMacRomanian ;
2345 break ;
2346 case wxFONTENCODING_MACCELTIC :
2347 enc = kCFStringEncodingMacCeltic ;
2348 break ;
2349 case wxFONTENCODING_MACGAELIC :
2350 enc = kCFStringEncodingMacGaelic ;
2351 break ;
2352 // case wxFONTENCODING_MACKEYBOARD :
2353 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2354 // break ;
2355 default :
2356 // because gcc is picky
2357 break ;
2358 } ;
2359 return enc ;
2360 }
2361
2362 class wxMBConv_cocoa : public wxMBConv
2363 {
2364 public:
2365 wxMBConv_cocoa()
2366 {
2367 Init(CFStringGetSystemEncoding()) ;
2368 }
2369
2370 #if wxUSE_FONTMAP
2371 wxMBConv_cocoa(const wxChar* name)
2372 {
2373 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2374 }
2375 #endif
2376
2377 wxMBConv_cocoa(wxFontEncoding encoding)
2378 {
2379 Init( wxCFStringEncFromFontEnc(encoding) );
2380 }
2381
2382 ~wxMBConv_cocoa()
2383 {
2384 }
2385
2386 void Init( CFStringEncoding encoding)
2387 {
2388 m_encoding = encoding ;
2389 }
2390
2391 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2392 {
2393 wxASSERT(szUnConv);
2394
2395 CFStringRef theString = CFStringCreateWithBytes (
2396 NULL, //the allocator
2397 (const UInt8*)szUnConv,
2398 strlen(szUnConv),
2399 m_encoding,
2400 false //no BOM/external representation
2401 );
2402
2403 wxASSERT(theString);
2404
2405 size_t nOutLength = CFStringGetLength(theString);
2406
2407 if (szOut == NULL)
2408 {
2409 CFRelease(theString);
2410 return nOutLength;
2411 }
2412
2413 CFRange theRange = { 0, nOutSize };
2414
2415 #if SIZEOF_WCHAR_T == 4
2416 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2417 #endif
2418
2419 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2420
2421 CFRelease(theString);
2422
2423 szUniCharBuffer[nOutLength] = '\0' ;
2424
2425 #if SIZEOF_WCHAR_T == 4
2426 wxMBConvUTF16 converter ;
2427 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2428 delete[] szUniCharBuffer;
2429 #endif
2430
2431 return nOutLength;
2432 }
2433
2434 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2435 {
2436 wxASSERT(szUnConv);
2437
2438 size_t nRealOutSize;
2439 size_t nBufSize = wxWcslen(szUnConv);
2440 UniChar* szUniBuffer = (UniChar*) szUnConv;
2441
2442 #if SIZEOF_WCHAR_T == 4
2443 wxMBConvUTF16 converter ;
2444 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2445 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2446 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2447 nBufSize /= sizeof(UniChar);
2448 #endif
2449
2450 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2451 NULL, //allocator
2452 szUniBuffer,
2453 nBufSize,
2454 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2455 );
2456
2457 wxASSERT(theString);
2458
2459 //Note that CER puts a BOM when converting to unicode
2460 //so we check and use getchars instead in that case
2461 if (m_encoding == kCFStringEncodingUnicode)
2462 {
2463 if (szOut != NULL)
2464 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2465
2466 nRealOutSize = CFStringGetLength(theString) + 1;
2467 }
2468 else
2469 {
2470 CFStringGetBytes(
2471 theString,
2472 CFRangeMake(0, CFStringGetLength(theString)),
2473 m_encoding,
2474 0, //what to put in characters that can't be converted -
2475 //0 tells CFString to return NULL if it meets such a character
2476 false, //not an external representation
2477 (UInt8*) szOut,
2478 nOutSize,
2479 (CFIndex*) &nRealOutSize
2480 );
2481 }
2482
2483 CFRelease(theString);
2484
2485 #if SIZEOF_WCHAR_T == 4
2486 delete[] szUniBuffer;
2487 #endif
2488
2489 return nRealOutSize - 1;
2490 }
2491
2492 bool IsOk() const
2493 {
2494 return m_encoding != kCFStringEncodingInvalidId &&
2495 CFStringIsEncodingAvailable(m_encoding);
2496 }
2497
2498 private:
2499 CFStringEncoding m_encoding ;
2500 };
2501
2502 #endif // defined(__WXCOCOA__)
2503
2504 // ============================================================================
2505 // Mac conversion classes
2506 // ============================================================================
2507
2508 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2509
2510 class wxMBConv_mac : public wxMBConv
2511 {
2512 public:
2513 wxMBConv_mac()
2514 {
2515 Init(CFStringGetSystemEncoding()) ;
2516 }
2517
2518 #if wxUSE_FONTMAP
2519 wxMBConv_mac(const wxChar* name)
2520 {
2521 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2522 }
2523 #endif
2524
2525 wxMBConv_mac(wxFontEncoding encoding)
2526 {
2527 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2528 }
2529
2530 ~wxMBConv_mac()
2531 {
2532 OSStatus status = noErr ;
2533 status = TECDisposeConverter(m_MB2WC_converter);
2534 status = TECDisposeConverter(m_WC2MB_converter);
2535 }
2536
2537
2538 void Init( TextEncodingBase encoding)
2539 {
2540 OSStatus status = noErr ;
2541 m_char_encoding = encoding ;
2542 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2543
2544 status = TECCreateConverter(&m_MB2WC_converter,
2545 m_char_encoding,
2546 m_unicode_encoding);
2547 status = TECCreateConverter(&m_WC2MB_converter,
2548 m_unicode_encoding,
2549 m_char_encoding);
2550 }
2551
2552 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2553 {
2554 OSStatus status = noErr ;
2555 ByteCount byteOutLen ;
2556 ByteCount byteInLen = strlen(psz) ;
2557 wchar_t *tbuf = NULL ;
2558 UniChar* ubuf = NULL ;
2559 size_t res = 0 ;
2560
2561 if (buf == NULL)
2562 {
2563 //apple specs say at least 32
2564 n = wxMax( 32 , byteInLen ) ;
2565 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2566 }
2567 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2568 #if SIZEOF_WCHAR_T == 4
2569 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2570 #else
2571 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2572 #endif
2573 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2574 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2575 #if SIZEOF_WCHAR_T == 4
2576 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2577 // is not properly terminated we get random characters at the end
2578 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2579 wxMBConvUTF16 converter ;
2580 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2581 free( ubuf ) ;
2582 #else
2583 res = byteOutLen / sizeof( UniChar ) ;
2584 #endif
2585 if ( buf == NULL )
2586 free(tbuf) ;
2587
2588 if ( buf && res < n)
2589 buf[res] = 0;
2590
2591 return res ;
2592 }
2593
2594 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2595 {
2596 OSStatus status = noErr ;
2597 ByteCount byteOutLen ;
2598 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2599
2600 char *tbuf = NULL ;
2601
2602 if (buf == NULL)
2603 {
2604 //apple specs say at least 32
2605 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2606 tbuf = (char*) malloc( n ) ;
2607 }
2608
2609 ByteCount byteBufferLen = n ;
2610 UniChar* ubuf = NULL ;
2611 #if SIZEOF_WCHAR_T == 4
2612 wxMBConvUTF16 converter ;
2613 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2614 byteInLen = unicharlen ;
2615 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2616 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2617 #else
2618 ubuf = (UniChar*) psz ;
2619 #endif
2620 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2621 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2622 #if SIZEOF_WCHAR_T == 4
2623 free( ubuf ) ;
2624 #endif
2625 if ( buf == NULL )
2626 free(tbuf) ;
2627
2628 size_t res = byteOutLen ;
2629 if ( buf && res < n)
2630 {
2631 buf[res] = 0;
2632
2633 //we need to double-trip to verify it didn't insert any ? in place
2634 //of bogus characters
2635 wxWCharBuffer wcBuf(n);
2636 size_t pszlen = wxWcslen(psz);
2637 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2638 wxWcslen(wcBuf) != pszlen ||
2639 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2640 {
2641 // we didn't obtain the same thing we started from, hence
2642 // the conversion was lossy and we consider that it failed
2643 return (size_t)-1;
2644 }
2645 }
2646
2647 return res ;
2648 }
2649
2650 bool IsOk() const
2651 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2652
2653 private:
2654 TECObjectRef m_MB2WC_converter ;
2655 TECObjectRef m_WC2MB_converter ;
2656
2657 TextEncodingBase m_char_encoding ;
2658 TextEncodingBase m_unicode_encoding ;
2659 };
2660
2661 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2662
2663 // ============================================================================
2664 // wxEncodingConverter based conversion classes
2665 // ============================================================================
2666
2667 #if wxUSE_FONTMAP
2668
2669 class wxMBConv_wxwin : public wxMBConv
2670 {
2671 private:
2672 void Init()
2673 {
2674 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2675 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2676 }
2677
2678 public:
2679 // temporarily just use wxEncodingConverter stuff,
2680 // so that it works while a better implementation is built
2681 wxMBConv_wxwin(const wxChar* name)
2682 {
2683 if (name)
2684 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2685 else
2686 m_enc = wxFONTENCODING_SYSTEM;
2687
2688 Init();
2689 }
2690
2691 wxMBConv_wxwin(wxFontEncoding enc)
2692 {
2693 m_enc = enc;
2694
2695 Init();
2696 }
2697
2698 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2699 {
2700 size_t inbuf = strlen(psz);
2701 if (buf)
2702 {
2703 if (!m2w.Convert(psz,buf))
2704 return (size_t)-1;
2705 }
2706 return inbuf;
2707 }
2708
2709 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2710 {
2711 const size_t inbuf = wxWcslen(psz);
2712 if (buf)
2713 {
2714 if (!w2m.Convert(psz,buf))
2715 return (size_t)-1;
2716 }
2717
2718 return inbuf;
2719 }
2720
2721 virtual size_t GetMBNulLen() const
2722 {
2723 switch ( m_enc )
2724 {
2725 case wxFONTENCODING_UTF16BE:
2726 case wxFONTENCODING_UTF16LE:
2727 return 2;
2728
2729 case wxFONTENCODING_UTF32BE:
2730 case wxFONTENCODING_UTF32LE:
2731 return 4;
2732
2733 default:
2734 return 1;
2735 }
2736 }
2737
2738 bool IsOk() const { return m_ok; }
2739
2740 public:
2741 wxFontEncoding m_enc;
2742 wxEncodingConverter m2w, w2m;
2743
2744 private:
2745 // were we initialized successfully?
2746 bool m_ok;
2747
2748 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2749 };
2750
2751 // make the constructors available for unit testing
2752 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2753 {
2754 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2755 if ( !result->IsOk() )
2756 {
2757 delete result;
2758 return 0;
2759 }
2760 return result;
2761 }
2762
2763 #endif // wxUSE_FONTMAP
2764
2765 // ============================================================================
2766 // wxCSConv implementation
2767 // ============================================================================
2768
2769 void wxCSConv::Init()
2770 {
2771 m_name = NULL;
2772 m_convReal = NULL;
2773 m_deferred = true;
2774 }
2775
2776 wxCSConv::wxCSConv(const wxChar *charset)
2777 {
2778 Init();
2779
2780 if ( charset )
2781 {
2782 SetName(charset);
2783 }
2784
2785 #if wxUSE_FONTMAP
2786 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2787 #else
2788 m_encoding = wxFONTENCODING_SYSTEM;
2789 #endif
2790 }
2791
2792 wxCSConv::wxCSConv(wxFontEncoding encoding)
2793 {
2794 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2795 {
2796 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2797
2798 encoding = wxFONTENCODING_SYSTEM;
2799 }
2800
2801 Init();
2802
2803 m_encoding = encoding;
2804 }
2805
2806 wxCSConv::~wxCSConv()
2807 {
2808 Clear();
2809 }
2810
2811 wxCSConv::wxCSConv(const wxCSConv& conv)
2812 : wxMBConv()
2813 {
2814 Init();
2815
2816 SetName(conv.m_name);
2817 m_encoding = conv.m_encoding;
2818 }
2819
2820 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2821 {
2822 Clear();
2823
2824 SetName(conv.m_name);
2825 m_encoding = conv.m_encoding;
2826
2827 return *this;
2828 }
2829
2830 void wxCSConv::Clear()
2831 {
2832 free(m_name);
2833 delete m_convReal;
2834
2835 m_name = NULL;
2836 m_convReal = NULL;
2837 }
2838
2839 void wxCSConv::SetName(const wxChar *charset)
2840 {
2841 if (charset)
2842 {
2843 m_name = wxStrdup(charset);
2844 m_deferred = true;
2845 }
2846 }
2847
// Cache used by wxCSConv::DoCreate() when trying iconv: for each encoding it
// stores the charset name which iconv accepted, or an empty string if none of
// the known names worked.
2848 #if wxUSE_FONTMAP
2849 #include "wx/hashmap.h"
2850
2851 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2852 wxEncodingNameCache );
2853
2854 static wxEncodingNameCache gs_nameCache;
2855 #endif
2856
2857 wxMBConv *wxCSConv::DoCreate() const
2858 {
2859 #if wxUSE_FONTMAP
2860 wxLogTrace(TRACE_STRCONV,
2861 wxT("creating conversion for %s"),
2862 (m_name ? m_name
2863 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2864 #endif // wxUSE_FONTMAP
2865
2866 // check for the special case of ASCII or ISO8859-1 charset: as we have
2867 // special knowledge of it anyhow, we don't need to create a special
2868 // conversion object
2869 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2870 m_encoding == wxFONTENCODING_DEFAULT )
2871 {
2872 // don't convert at all
2873 return NULL;
2874 }
2875
2876 // we trust OS to do conversion better than we can so try external
2877 // conversion methods first
2878 //
2879 // the full order is:
2880 // 1. OS conversion (iconv() under Unix or Win32 API)
2881 // 2. hard coded conversions for UTF
2882 // 3. wxEncodingConverter as fall back
2883
2884 // step (1)
2885 #ifdef HAVE_ICONV
2886 #if !wxUSE_FONTMAP
2887 if ( m_name )
2888 #endif // !wxUSE_FONTMAP
2889 {
2890 wxString name(m_name);
2891 wxFontEncoding encoding(m_encoding);
2892
2893 if ( !name.empty() )
2894 {
2895 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2896 if ( conv->IsOk() )
2897 return conv;
2898
2899 delete conv;
2900
2901 #if wxUSE_FONTMAP
2902 encoding =
2903 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2904 #endif // wxUSE_FONTMAP
2905 }
2906 #if wxUSE_FONTMAP
2907 {
2908 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2909 if ( it != gs_nameCache.end() )
2910 {
2911 if ( it->second.empty() )
2912 return NULL;
2913
2914 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2915 if ( conv->IsOk() )
2916 return conv;
2917
2918 delete conv;
2919 }
2920
2921 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2922
2923 for ( ; *names; ++names )
2924 {
2925 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2926 if ( conv->IsOk() )
2927 {
2928 gs_nameCache[encoding] = *names;
2929 return conv;
2930 }
2931
2932 delete conv;
2933 }
2934
2935 gs_nameCache[encoding] = _T(""); // cache the failure
2936 }
2937 #endif // wxUSE_FONTMAP
2938 }
2939 #endif // HAVE_ICONV
2940
2941 #ifdef wxHAVE_WIN32_MB2WC
2942 {
2943 #if wxUSE_FONTMAP
2944 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2945 : new wxMBConv_win32(m_encoding);
2946 if ( conv->IsOk() )
2947 return conv;
2948
2949 delete conv;
2950 #else
2951 return NULL;
2952 #endif
2953 }
2954 #endif // wxHAVE_WIN32_MB2WC
2955 #if defined(__WXMAC__)
2956 {
2957 // leave UTF16 and UTF32 to the built-ins of wx
2958 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2959 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2960 {
2961
2962 #if wxUSE_FONTMAP
2963 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2964 : new wxMBConv_mac(m_encoding);
2965 #else
2966 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2967 #endif
2968 if ( conv->IsOk() )
2969 return conv;
2970
2971 delete conv;
2972 }
2973 }
2974 #endif
2975 #if defined(__WXCOCOA__)
2976 {
2977 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2978 {
2979
2980 #if wxUSE_FONTMAP
2981 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2982 : new wxMBConv_cocoa(m_encoding);
2983 #else
2984 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2985 #endif
2986 if ( conv->IsOk() )
2987 return conv;
2988
2989 delete conv;
2990 }
2991 }
2992 #endif
2993 // step (2)
2994 wxFontEncoding enc = m_encoding;
2995 #if wxUSE_FONTMAP
2996 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2997 {
2998 // use "false" to suppress interactive dialogs -- we can be called from
2999 // anywhere and popping up a dialog from here is the last thing we want to
3000 // do
3001 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3002 }
3003 #endif // wxUSE_FONTMAP
3004
3005 switch ( enc )
3006 {
3007 case wxFONTENCODING_UTF7:
3008 return new wxMBConvUTF7;
3009
3010 case wxFONTENCODING_UTF8:
3011 return new wxMBConvUTF8;
3012
3013 case wxFONTENCODING_UTF16BE:
3014 return new wxMBConvUTF16BE;
3015
3016 case wxFONTENCODING_UTF16LE:
3017 return new wxMBConvUTF16LE;
3018
3019 case wxFONTENCODING_UTF32BE:
3020 return new wxMBConvUTF32BE;
3021
3022 case wxFONTENCODING_UTF32LE:
3023 return new wxMBConvUTF32LE;
3024
3025 default:
3026 // nothing to do but put here to suppress gcc warnings
3027 ;
3028 }
3029
3030 // step (3)
3031 #if wxUSE_FONTMAP
3032 {
3033 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3034 : new wxMBConv_wxwin(m_encoding);
3035 if ( conv->IsOk() )
3036 return conv;
3037
3038 delete conv;
3039 }
3040 #endif // wxUSE_FONTMAP
3041
3042 // NB: This is a hack to prevent deadlock. What could otherwise happen
3043 // in Unicode build: wxConvLocal creation ends up being here
3044 // because of some failure and logs the error. But wxLog will try to
3045 // attach timestamp, for which it will need wxConvLocal (to convert
3046 // time to char* and then wchar_t*), but that fails, tries to log
3047 // error, but wxLog has a (already locked) critical section that
3048 // guards static buffer.
3049 static bool alreadyLoggingError = false;
3050 if (!alreadyLoggingError)
3051 {
3052 alreadyLoggingError = true;
3053 wxLogError(_("Cannot convert from the charset '%s'!"),
3054 m_name ? m_name
3055 :
3056 #if wxUSE_FONTMAP
3057 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3058 #else // !wxUSE_FONTMAP
3059 wxString::Format(_("encoding %s"), m_encoding).c_str()
3060 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3061 );
3062 alreadyLoggingError = false;
3063 }
3064
3065 return NULL;
3066 }
3067
3068 void wxCSConv::CreateConvIfNeeded() const
3069 {
3070 if ( m_deferred )
3071 {
3072 wxCSConv *self = (wxCSConv *)this; // const_cast
3073
3074 #if wxUSE_INTL
3075 // if we don't have neither the name nor the encoding, use the default
3076 // encoding for this system
3077 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3078 {
3079 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3080 }
3081 #endif // wxUSE_INTL
3082
3083 self->m_convReal = DoCreate();
3084 self->m_deferred = false;
3085 }
3086 }
3087
3088 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3089 {
3090 CreateConvIfNeeded();
3091
3092 if (m_convReal)
3093 return m_convReal->MB2WC(buf, psz, n);
3094
3095 // latin-1 (direct)
3096 size_t len = strlen(psz);
3097
3098 if (buf)
3099 {
3100 for (size_t c = 0; c <= len; c++)
3101 buf[c] = (unsigned char)(psz[c]);
3102 }
3103
3104 return len;
3105 }
3106
3107 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3108 {
3109 CreateConvIfNeeded();
3110
3111 if (m_convReal)
3112 return m_convReal->WC2MB(buf, psz, n);
3113
3114 // latin-1 (direct)
3115 const size_t len = wxWcslen(psz);
3116 if (buf)
3117 {
3118 for (size_t c = 0; c <= len; c++)
3119 {
3120 if (psz[c] > 0xFF)
3121 return (size_t)-1;
3122 buf[c] = (char)psz[c];
3123 }
3124 }
3125 else
3126 {
3127 for (size_t c = 0; c <= len; c++)
3128 {
3129 if (psz[c] > 0xFF)
3130 return (size_t)-1;
3131 }
3132 }
3133
3134 return len;
3135 }
3136
3137 size_t wxCSConv::GetMBNulLen() const
3138 {
3139 CreateConvIfNeeded();
3140
3141 if ( m_convReal )
3142 {
3143 return m_convReal->GetMBNulLen();
3144 }
3145
3146 return 1;
3147 }
3148
3149 // ----------------------------------------------------------------------------
3150 // globals
3151 // ----------------------------------------------------------------------------
3152
// The object behind wxConvLibc: its class depends on the platform.
3153 #ifdef __WINDOWS__
3154 static wxMBConv_win32 wxConvLibcObj;
3155 #elif defined(__WXMAC__) && !defined(__MACH__)
3156 static wxMBConv_mac wxConvLibcObj ;
3157 #else
3158 static wxMBConvLibc wxConvLibcObj;
3159 #endif
3160
// The objects behind the other predefined converters.
3161 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3162 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3163 static wxMBConvUTF7 wxConvUTF7Obj;
3164 static wxMBConvUTF8 wxConvUTF8Obj;
3165
// Exported references and pointers bound to the static objects above.
3166 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3167 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3168 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3169 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3170 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3171 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
// wxConvFileName points to the UTF-8 converter under __WXOSX__ and to the
// libc one everywhere else.
3172 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3173 #ifdef __WXOSX__
3174 wxConvUTF8Obj;
3175 #else
3176 wxConvLibcObj;
3177 #endif
3178
3179
3180 #else // !wxUSE_WCHAR_T
3181
3182 // stand-ins in absence of wchar_t
3183 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3184 wxConvISO8859_1,
3185 wxConvLocal,
3186 wxConvUTF8;
3187
3188 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T