]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
1. use a manifest constant wxNO_LEN instead of -1 for lengths everywhere
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
25
26 #ifdef __BORLANDC__
27 #pragma hdrstop
28 #endif
29
30 #ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33 #endif // WX_PRECOMP
34
35 #include "wx/strconv.h"
36
37 #if wxUSE_WCHAR_T
38
39 #ifdef __WINDOWS__
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #endif
43
44 #ifndef __WXWINCE__
45 #include <errno.h>
46 #endif
47
48 #include <ctype.h>
49 #include <string.h>
50 #include <stdlib.h>
51
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
55
56 #ifdef __SALFORDC__
57 #include <clib.h>
58 #endif
59
60 #ifdef HAVE_ICONV
61 #include <iconv.h>
62 #include "wx/thread.h"
63 #endif
64
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
67 #include "wx/utils.h"
68
69 #ifdef __WXMAC__
70 #ifndef __DARWIN__
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
74 #endif
75
76 #include "wx/mac/private.h" // includes mac headers
77 #endif
78
79 #define TRACE_STRCONV _T("strconv")
80
81 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
82 // be 4 bytes
83 #if SIZEOF_WCHAR_T == 2
84 #define WC_UTF16
85 #endif
86
87 // ============================================================================
88 // implementation
89 // ============================================================================
90
91 // helper function of cMB2WC(): check if n bytes at this location are all NUL
// Returns true if at least one of the n bytes starting at p is not NUL and
// false if all of them are (which is trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
99
100 // ----------------------------------------------------------------------------
101 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
102 // ----------------------------------------------------------------------------
103
104 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
105 {
106 if (input<=0xffff)
107 {
108 if (output)
109 *output = (wxUint16) input;
110 return 1;
111 }
112 else if (input>=0x110000)
113 {
114 return wxCONV_FAILED;
115 }
116 else
117 {
118 if (output)
119 {
120 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
121 *output = (wxUint16) ((input&0x3ff)+0xdc00);
122 }
123 return 2;
124 }
125 }
126
127 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
128 {
129 if ((*input<0xd800) || (*input>0xdfff))
130 {
131 output = *input;
132 return 1;
133 }
134 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
135 {
136 output = *input;
137 return wxCONV_FAILED;
138 }
139 else
140 {
141 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
142 return 2;
143 }
144 }
145
146 #ifdef WC_UTF16
147
148 // returns the next UTF-32 character from the wchar_t buffer and advances the
149 // pointer to the character after this one
150 //
151 // if an invalid character is found, *pSrc is set to NULL, the caller must
152 // check for this
153 static wxUint32 wxDecodeSurrogate(const wchar_t **pSrc)
154 {
155 wxUint32 out;
156 const size_t n = decode_utf16(*pSrc, out);
157 if ( n == wxCONV_FAILED )
158 *pSrc = NULL;
159 else
160 *pSrc += n;
161
162 return out;
163 }
164
165 #endif // WC_UTF16
166
167 // ----------------------------------------------------------------------------
168 // wxMBConv
169 // ----------------------------------------------------------------------------
170
// Default implementation of ToWChar() in terms of the legacy MB2WC().
//
// Converts srcLen bytes of src (or the NUL-terminated string src if srcLen is
// wxNO_LEN) to wide characters. If dst is NULL nothing is written and only
// the length is computed, otherwise the result must fit in dstLen wide
// characters. When a length is given, the input is converted chunk by chunk,
// each chunk ending at a NUL terminator.
//
// Returns the number of wide characters, counting the L'\0' terminating each
// chunk, or wxCONV_FAILED if the conversion failed or dst is too small.
171 size_t
172 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
173 const char *src, size_t srcLen) const
174 {
175 // although new conversion classes are supposed to implement this function
176 // directly, the existing ones only implement the old MB2WC() and so, to
177 // avoid having to rewrite all conversion classes at once, we provide a
178 // default (but not efficient) implementation of this one in terms of the
179 // old function by copying the input to ensure that it's NUL-terminated and
180 // then using MB2WC() to convert it
181
182 // the number of chars [which would be] written to dst [if it were not NULL]
183 size_t dstWritten = 0;
184
185 // the number of NULs terminating this string (only set and used when
// srcLen was specified)
186 size_t nulLen wxDUMMY_INITIALIZE(0);
187
188 // if we were not given the input size we just have to assume that the
189 // string is properly terminated as we have no way of knowing how long it
190 // is anyhow, but if we do have the size check whether there are enough
191 // NULs at the end
192 wxCharBuffer bufTmp;
193 const char *srcEnd;
194 if ( srcLen != wxNO_LEN )
195 {
196 // we need to know how to find the end of this string
197 nulLen = GetMBNulLen();
198 if ( nulLen == wxCONV_FAILED )
199 return wxCONV_FAILED;
200
201 // if there are enough NULs we can avoid the copy
202 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
203 {
204 // make a copy in order to properly NUL-terminate the string
205 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
206 char * const p = bufTmp.data();
207 memcpy(p, src, srcLen);
208 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
209 *s = '\0';
210
// from now on work with the terminated copy, which bufTmp keeps alive
// until we return
211 src = bufTmp;
212 }
213
214 srcEnd = src + srcLen;
215 }
216 else // quit after the first loop iteration
217 {
218 srcEnd = NULL;
219 }
220
221 for ( ;; )
222 {
223 // try to convert the current chunk
224 size_t lenChunk = MB2WC(NULL, src, 0);
225 if ( lenChunk == wxCONV_FAILED )
226 return wxCONV_FAILED;
227
228 lenChunk++; // for the L'\0' at the end of this chunk
229
230 dstWritten += lenChunk;
231
// an empty chunk stops the conversion: its terminator has been counted
// in dstWritten above but nothing is stored in dst for it
232 if ( lenChunk == 1 )
233 {
234 // nothing left in the input string, conversion succeeded
235 break;
236 }
237
238 if ( dst )
239 {
240 if ( dstWritten > dstLen )
241 return wxCONV_FAILED;
242
243 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
244 return wxCONV_FAILED;
245
246 dst += lenChunk;
247 }
248
249 if ( !srcEnd )
250 {
251 // we convert just one chunk in this case as this is the entire
252 // string anyhow
253 break;
254 }
255
256 // advance the input pointer past the end of this chunk
257 while ( NotAllNULs(src, nulLen) )
258 {
259 // notice that we must skip over multiple bytes here as we suppose
260 // that if NUL takes 2 or 4 bytes, then all the other characters do
261 // too and so if advanced by a single byte we might erroneously
262 // detect sequences of NUL bytes in the middle of the input
263 src += nulLen;
264 }
265
266 src += nulLen; // skipping over its terminator as well
267
268 // note that ">=" (and not just "==") is needed here as the terminator
269 // we skipped just above could be inside or just after the buffer
270 // delimited by srcEnd
271 if ( src >= srcEnd )
272 break;
273 }
274
275 return dstWritten;
276 }
277
278 size_t
279 wxMBConv::FromWChar(char *dst, size_t dstLen,
280 const wchar_t *src, size_t srcLen) const
281 {
282 // the number of chars [which would be] written to dst [if it were not NULL]
283 size_t dstWritten = 0;
284
285 // make a copy of the input string unless it is already properly
286 // NUL-terminated
287 //
288 // if we don't know its length we have no choice but to assume that it is,
289 // indeed, properly terminated
290 wxWCharBuffer bufTmp;
291 if ( srcLen == wxNO_LEN )
292 {
293 srcLen = wxWcslen(src) + 1;
294 }
295 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
296 {
297 // make a copy in order to properly NUL-terminate the string
298 bufTmp = wxWCharBuffer(srcLen);
299 memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
300 src = bufTmp;
301 }
302
303 const size_t lenNul = GetMBNulLen();
304 for ( const wchar_t * const srcEnd = src + srcLen;
305 src < srcEnd;
306 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
307 {
308 // try to convert the current chunk
309 size_t lenChunk = WC2MB(NULL, src, 0);
310
311 if ( lenChunk == wxCONV_FAILED )
312 return wxCONV_FAILED;
313
314 lenChunk += lenNul;
315 dstWritten += lenChunk;
316
317 if ( dst )
318 {
319 if ( dstWritten > dstLen )
320 return wxCONV_FAILED;
321
322 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
323 return wxCONV_FAILED;
324
325 dst += lenChunk;
326 }
327 }
328
329 return dstWritten;
330 }
331
332 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
333 {
334 size_t rc = ToWChar(out, outLen, in);
335 if ( rc != wxCONV_FAILED )
336 {
337 // ToWChar() returns the buffer length, i.e. including the trailing
338 // NUL, while this method doesn't take it into account
339 rc--;
340 }
341
342 return rc;
343 }
344
345 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
346 {
347 size_t rc = FromWChar(out, outLen, in);
348 if ( rc != wxCONV_FAILED )
349 {
350 rc -= GetMBNulLen();
351 }
352
353 return rc;
354 }
355
356 wxMBConv::~wxMBConv()
357 {
358 // nothing to do here (necessary for Darwin linking probably)
359 }
360
361 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
362 {
363 if ( psz )
364 {
365 // calculate the length of the buffer needed first
366 const size_t nLen = MB2WC(NULL, psz, 0);
367 if ( nLen != wxCONV_FAILED )
368 {
369 // now do the actual conversion
370 wxWCharBuffer buf(nLen /* +1 added implicitly */);
371
372 // +1 for the trailing NULL
373 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
374 return buf;
375 }
376 }
377
378 return wxWCharBuffer();
379 }
380
381 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
382 {
383 if ( pwz )
384 {
385 const size_t nLen = WC2MB(NULL, pwz, 0);
386 if ( nLen != wxCONV_FAILED )
387 {
388 // extra space for trailing NUL(s)
389 static const size_t extraLen = GetMaxMBNulLen();
390
391 wxCharBuffer buf(nLen + extraLen - 1);
392 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
393 return buf;
394 }
395 }
396
397 return wxCharBuffer();
398 }
399
400 const wxWCharBuffer
401 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
402 {
403 const size_t dstLen = ToWChar(NULL, 0, in, inLen);
404 if ( dstLen != wxCONV_FAILED )
405 {
406 wxWCharBuffer wbuf(dstLen - 1);
407 if ( ToWChar(wbuf.data(), dstLen, in, inLen) != wxCONV_FAILED )
408 {
409 if ( outLen )
410 {
411 *outLen = dstLen;
412 if ( wbuf[dstLen - 1] == L'\0' )
413 (*outLen)--;
414 }
415
416 return wbuf;
417 }
418 }
419
420 if ( outLen )
421 *outLen = 0;
422
423 return wxWCharBuffer();
424 }
425
426 const wxCharBuffer
427 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
428 {
429 const size_t dstLen = FromWChar(NULL, 0, in, inLen);
430 if ( dstLen != wxCONV_FAILED )
431 {
432 wxCharBuffer buf(dstLen - 1);
433 if ( FromWChar(buf.data(), dstLen, in, inLen) != wxCONV_FAILED )
434 {
435 if ( outLen )
436 {
437 *outLen = dstLen;
438
439 const size_t nulLen = GetMBNulLen();
440 if ( !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
441 {
442 // in this case the output is NUL-terminated and we're not
443 // supposed to count NUL
444 (*outLen) -= nulLen;
445 }
446 }
447
448 return buf;
449 }
450 }
451
452 if ( outLen )
453 *outLen = 0;
454
455 return wxCharBuffer();
456 }
457
458 // ----------------------------------------------------------------------------
459 // wxMBConvLibc
460 // ----------------------------------------------------------------------------
461
462 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
463 {
464 return wxMB2WC(buf, psz, n);
465 }
466
467 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
468 {
469 return wxWC2MB(buf, psz, n);
470 }
471
472 // ----------------------------------------------------------------------------
473 // wxConvBrokenFileNames
474 // ----------------------------------------------------------------------------
475
476 #ifdef __UNIX__
477
478 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
479 {
480 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
481 || wxStricmp(charset, _T("UTF8")) == 0 )
482 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
483 else
484 m_conv = new wxCSConv(charset);
485 }
486
487 #endif // __UNIX__
488
489 // ----------------------------------------------------------------------------
490 // UTF-7
491 // ----------------------------------------------------------------------------
492
493 // Implementation (C) 2004 Fredrik Roubert
494
495 //
496 // BASE64 decoding table
497 //
// This table is indexed by the value of an input byte and contains the 6 bit
// value of the corresponding BASE64 digit ('A'-'Z' map to 0x00-0x19, 'a'-'z'
// to 0x1a-0x33, '0'-'9' to 0x34-0x3d, '+' to 0x3e and '/' to 0x3f) or 0xff
// for all the bytes which are not BASE64 digits.
498 static const unsigned char utf7unb64[] =
499 {
500 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
502 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
504 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
505 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
506 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
507 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
508 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
509 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
510 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
511 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
513 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
514 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
515 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
532 };
533
// Decodes the NUL-terminated UTF-7 string psz.
//
// If buf is NULL only the length of the result is computed, otherwise the
// decoded characters are stored in buf. A new input character is only
// decoded while fewer than n characters have been output, and the result is
// NUL-terminated only if fewer than n characters were output in total.
//
// Returns the number of wide characters in the result, not counting the
// terminating NUL, or wxCONV_FAILED if a '+' (other than the "+-" sequence)
// is not followed by enough BASE64 digits to decode at least one byte.
534 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
535 {
536 size_t len = 0;
537
538 while ( *psz && (!buf || (len < n)) )
539 {
540 unsigned char cc = *psz++;
541 if (cc != '+')
542 {
543 // plain ASCII char
544 if (buf)
545 *buf++ = cc;
546 len++;
547 }
548 else if (*psz == '-')
549 {
550 // encoded plus sign
551 if (buf)
552 *buf++ = cc;
553 len++;
554 psz++;
555 }
556 else // start of BASE64 encoded string
557 {
// d accumulates the decoded bits and l is the number of bits in it which
// have not been output yet; the decoded bytes are alternately the most
// (lsb == false) and the least (lsb == true) significant byte of a 16 bit
// character; ok becomes true once at least one byte has been decoded
558 bool lsb, ok;
559 unsigned int d, l;
// the loop stops at the first byte which is not a BASE64 digit (this
// includes the terminating NUL)
560 for ( ok = lsb = false, d = 0, l = 0;
561 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
562 psz++ )
563 {
564 d <<= 6;
565 d += cc;
566 for (l += 6; l >= 8; lsb = !lsb)
567 {
568 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
569 if (lsb)
570 {
// low byte: it completes the character started by the previous byte
571 if (buf)
572 *buf++ |= c;
573 len ++;
574 }
575 else
576 {
// high byte: start a new character, it is only counted once its low
// byte is seen
577 if (buf)
578 *buf = (wchar_t)(c << 8);
579 }
580
581 ok = true;
582 }
583 }
584
585 if ( !ok )
586 {
587 // in valid UTF7 we should have valid characters after '+'
588 return wxCONV_FAILED;
589 }
590
// skip the optional '-' ending the BASE64 sequence
591 if (*psz == '-')
592 psz++;
593 }
594 }
595
596 if ( buf && (len < n) )
597 *buf = '\0';
598
599 return len;
600 }
601
602 //
603 // BASE64 encoding table
604 //
// This table maps a 6 bit value (0-63) to the corresponding BASE64 digit.
605 static const unsigned char utf7enb64[] =
606 {
607 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
608 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
609 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
610 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
611 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
612 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
613 'w', 'x', 'y', 'z', '0', '1', '2', '3',
614 '4', '5', '6', '7', '8', '9', '+', '/'
615 };
616
617 //
618 // UTF-7 encoding table
619 //
// This table is indexed by the character code (0-127) and contains the class
// of the character:
//
620 // 0 - Set D (directly encoded characters)
621 // 1 - Set O (optional direct characters)
622 // 2 - whitespace characters (optional)
623 // 3 - special characters
624 //
// Note that wxMBConvUTF7::WC2MB() in this file only tests for "< 1", i.e. it
// outputs directly only the characters of class 0.
625 static const unsigned char utf7encode[128] =
626 {
627 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
628 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
629 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
630 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
631 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
632 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
633 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
634 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
635 };
636
// Encodes the NUL-terminated wide string psz in UTF-7.
//
// If buf is NULL only the length of the result is computed, otherwise the
// encoded bytes are stored in buf. A new input character is only encoded
// while fewer than n bytes have been output, and the result is
// NUL-terminated only if fewer than n bytes were output in total.
//
// Returns the number of bytes in the result, not counting the terminating
// NUL, or (only if WC_UTF16 is not defined) wxCONV_FAILED if the input
// contains a character greater than 0xffff.
//
// NOTE(review): the "len < n" check is only done before starting to encode
// each character (or run of characters), the bytes written inside the loop
// body are not checked against n -- confirm that callers always pass big
// enough buffers.
637 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
638 {
639 size_t len = 0;
640
641 while (*psz && ((!buf) || (len < n)))
642 {
643 wchar_t cc = *psz++;
// only the characters of class 0 in utf7encode are output as is
644 if (cc < 0x80 && utf7encode[cc] < 1)
645 {
646 // plain ASCII char
647 if (buf)
648 *buf++ = (char)cc;
649 len++;
650 }
651 #ifndef WC_UTF16
652 else if (((wxUint32)cc) > 0xffff)
653 {
654 // no surrogate pair generation (yet?)
655 return wxCONV_FAILED;
656 }
657 #endif
658 else
659 {
// everything else is output between '+' and '-': a plus sign itself
// becomes "+-", other characters are BASE64 encoded
660 if (buf)
661 *buf++ = '+';
662 len++;
663 if (cc != '+')
664 {
665 // BASE64 encode string
// d accumulates the bits to output and l is the number of bits in it which
// have not been output yet; the loop consumes all the consecutive
// characters which can't be output directly
666 unsigned int lsb, d, l;
667 for (d = 0, l = 0; /*nothing*/; psz++)
668 {
// each character contributes 2 bytes, most significant one first
669 for (lsb = 0; lsb < 2; lsb ++)
670 {
671 d <<= 8;
672 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
673
674 for (l += 8; l >= 6; )
675 {
676 l -= 6;
677 if (buf)
678 *buf++ = utf7enb64[(d >> l) % 64];
679 len++;
680 }
681 }
// stop at the end of the string or at the first character which can be
// output directly; psz is left pointing to it
682 cc = *psz;
683 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
684 break;
685 }
// output the remaining bits, if any, padded with zero bits to make a full
// BASE64 digit
686 if (l != 0)
687 {
688 if (buf)
689 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
690 len++;
691 }
692 }
693 if (buf)
694 *buf++ = '-';
695 len++;
696 }
697 }
698 if (buf && (len < n))
699 *buf = 0;
700 return len;
701 }
702
703 // ----------------------------------------------------------------------------
704 // UTF-8
705 // ----------------------------------------------------------------------------
706
// utf8_max[i] is the greatest value which is treated as encodable using
// i + 1 bytes by the UTF-8 conversion functions in this file
707 static wxUint32 utf8_max[]=
708 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
709
710 // boundaries of the private use area we use to (temporarily) remap the bytes
711 // which are invalid in a UTF-8 encoded string: wxUnicodePUAEnd is exclusive
712 const wxUint32 wxUnicodePUA = 0x100000;
713 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
714
// Decodes the NUL-terminated UTF-8 string psz.
//
// If buf is NULL only the length of the result is computed, otherwise the
// decoded characters are stored in buf. A new input sequence is only decoded
// while fewer than n characters have been output, and the result is
// NUL-terminated only if fewer than n characters were output in total.
//
// Invalid input is handled according to m_options:
//  - with MAP_INVALID_UTF8_TO_PUA each byte of an invalid sequence is output
//    as the character wxUnicodePUA + byte value
//  - with MAP_INVALID_UTF8_TO_OCTAL each byte of an invalid sequence is
//    output as a backslash followed by 3 octal digits (and backslashes in
//    the input are doubled)
//  - otherwise the conversion fails
//
// Returns the number of wide characters in the result, not counting the
// terminating NUL, or wxCONV_FAILED.
715 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
716 {
717 size_t len = 0;
718
719 while (*psz && ((!buf) || (len < n)))
720 {
// remember the start of this sequence to be able to output its bytes one
// by one if it turns out to be invalid
721 const char *opsz = psz;
722 bool invalid = false;
723 unsigned char cc = *psz++, fc = cc;
// count the leading 1 bits of the first byte
724 unsigned cnt;
725 for (cnt = 0; fc & 0x80; cnt++)
726 fc <<= 1;
727 if (!cnt)
728 {
729 // plain ASCII char
730 if (buf)
731 *buf++ = cc;
732 len++;
733
734 // escape the escape character for octal escapes
735 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
736 && cc == '\\' && (!buf || len < n))
737 {
738 if (buf)
739 *buf++ = cc;
740 len++;
741 }
742 }
743 else
744 {
// from now on cnt is the number of the bytes which should follow the
// first one
745 cnt--;
746 if (!cnt)
747 {
748 // invalid UTF-8 sequence
749 invalid = true;
750 }
751 else
752 {
753 unsigned ocnt = cnt - 1;
754 wxUint32 res = cc & (0x3f >> cnt);
755 while (cnt--)
756 {
757 cc = *psz;
758 if ((cc & 0xC0) != 0x80)
759 {
760 // invalid UTF-8 sequence
761 invalid = true;
762 break;
763 }
764 psz++;
765 res = (res << 6) | (cc & 0x3f);
766 }
// the value must not be representable using a shorter sequence
767 if (invalid || res <= utf8_max[ocnt])
768 {
769 // illegal UTF-8 encoding
770 invalid = true;
771 }
772 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
773 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
774 {
775 // if one of our PUA characters turns up externally
776 // it must also be treated as an illegal sequence
777 // (a bit like you have to escape an escape character)
778 invalid = true;
779 }
780 else
781 {
782 #ifdef WC_UTF16
783 // cast is ok because wchar_t == wxUint16 if WC_UTF16
784 size_t pa = encode_utf16(res, (wxUint16 *)buf);
785 if (pa == wxCONV_FAILED)
786 {
787 invalid = true;
788 }
789 else
790 {
791 if (buf)
792 buf += pa;
793 len += pa;
794 }
795 #else // !WC_UTF16
796 if (buf)
797 *buf++ = (wchar_t)res;
798 len++;
799 #endif // WC_UTF16/!WC_UTF16
800 }
801 }
802 if (invalid)
803 {
// output all the bytes of the invalid sequence consumed so far, i.e. those
// between opsz and psz, according to the mapping mode
804 if (m_options & MAP_INVALID_UTF8_TO_PUA)
805 {
806 while (opsz < psz && (!buf || len < n))
807 {
808 #ifdef WC_UTF16
809 // cast is ok because wchar_t == wxUint16 if WC_UTF16
810 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
811 wxASSERT(pa != wxCONV_FAILED);
812 if (buf)
813 buf += pa;
814 opsz++;
815 len += pa;
816 #else
817 if (buf)
818 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
819 opsz++;
820 len++;
821 #endif
822 }
823 }
824 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
825 {
826 while (opsz < psz && (!buf || len < n))
827 {
// the escape takes 4 characters and is only written if all of them fit in
// the buffer, but it is always counted in len
828 if ( buf && len + 3 < n )
829 {
830 unsigned char on = *opsz;
831 *buf++ = L'\\';
832 *buf++ = (wchar_t)( L'0' + on / 0100 );
833 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
834 *buf++ = (wchar_t)( L'0' + on % 010 );
835 }
836 opsz++;
837 len += 4;
838 }
839 }
840 else // MAP_INVALID_UTF8_NOT
841 {
842 return wxCONV_FAILED;
843 }
844 }
845 }
846 }
847 if (buf && (len < n))
848 *buf = 0;
849 return len;
850 }
851
// Returns true if wch is one of the octal digits, i.e. is in '0'..'7' range.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
856
// Encodes the NUL-terminated wide string psz in UTF-8.
//
// If buf is NULL only the length of the result is computed, otherwise the
// encoded bytes are stored in buf. A new input character is only encoded
// while fewer than n bytes have been output, and the result is
// NUL-terminated only if fewer than n bytes were output in total.
//
// The mappings done by MB2WC() according to m_options are undone here:
//  - with MAP_INVALID_UTF8_TO_PUA the characters in the range
//    [wxUnicodePUA, wxUnicodePUAEnd) are output as the single bytes they
//    stand for
//  - with MAP_INVALID_UTF8_TO_OCTAL a double backslash is output as a single
//    one and a backslash followed by 3 octal digits as the corresponding byte
//
// Returns the number of bytes in the result, not counting the terminating
// NUL.
//
// NOTE(review): the "len < n" check is only done before starting to encode
// each character, the bytes of a multibyte sequence are not checked against
// n individually -- confirm that callers always pass big enough buffers.
857 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
858 {
859 size_t len = 0;
860
861 while (*psz && ((!buf) || (len < n)))
862 {
863 wxUint32 cc;
864 #ifdef WC_UTF16
865 // cast is ok for WC_UTF16
866 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
// if decoding failed, cc is the value of the current unit and we just skip
// over it
867 psz += (pa == wxCONV_FAILED) ? 1 : pa;
868 #else
869 cc=(*psz++) & 0x7fffffff;
870 #endif
871
872 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
873 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
874 {
875 if (buf)
876 *buf++ = (char)(cc - wxUnicodePUA);
877 len++;
878 }
879 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
880 && cc == L'\\' && psz[0] == L'\\' )
881 {
882 if (buf)
883 *buf++ = (char)cc;
884 psz++;
885 len++;
886 }
// psz[1] and psz[2] are only examined if the preceding characters are octal
// digits, so we never read beyond the terminating NUL here
887 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
888 cc == L'\\' &&
889 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
890 {
891 if (buf)
892 {
893 *buf++ = (char) ((psz[0] - L'0')*0100 +
894 (psz[1] - L'0')*010 +
895 (psz[2] - L'0'));
896 }
897
898 psz += 3;
899 len++;
900 }
901 else
902 {
// find the number of bytes (less one) needed to encode this character
903 unsigned cnt;
904 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
905 if (!cnt)
906 {
907 // plain ASCII char
908 if (buf)
909 *buf++ = (char) cc;
910 len++;
911 }
912
913 else
914 {
915 len += cnt + 1;
916 if (buf)
917 {
// the lead byte has cnt + 1 high bits set followed by the most significant
// bits of the character, each of the following bytes starts with the bits
// 10 and contains the next 6 bits of it
918 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
919 while (cnt--)
920 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
921 }
922 }
923 }
924 }
925
926 if (buf && (len<n))
927 *buf = 0;
928
929 return len;
930 }
931
932 // ============================================================================
933 // UTF-16
934 // ============================================================================
935
// the functions below are written in terms of "straight" and "swap"
// conversions which are mapped to the big and little endian UTF-16
// conversion classes depending on whether WORDS_BIGENDIAN is defined
936 #ifdef WORDS_BIGENDIAN
937 #define wxMBConvUTF16straight wxMBConvUTF16BE
938 #define wxMBConvUTF16swap wxMBConvUTF16LE
939 #else
940 #define wxMBConvUTF16swap wxMBConvUTF16BE
941 #define wxMBConvUTF16straight wxMBConvUTF16LE
942 #endif
943
944 /* static */
945 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
946 {
947 if ( srcLen == wxNO_LEN )
948 {
949 // count the number of bytes in input, including the trailing NULs
950 const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
951 for ( srcLen = 1; *in++; srcLen++ )
952 ;
953
954 srcLen *= BYTES_PER_CHAR;
955 }
956 else // we already have the length
957 {
958 // we can only convert an entire number of UTF-16 characters
959 if ( srcLen % BYTES_PER_CHAR )
960 return wxCONV_FAILED;
961 }
962
963 return srcLen;
964 }
965
966 // case when in-memory representation is UTF-16 too
967 #ifdef WC_UTF16
968
969 // ----------------------------------------------------------------------------
970 // conversions without endianness change
971 // ----------------------------------------------------------------------------
972
973 size_t
974 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
975 const char *src, size_t srcLen) const
976 {
977 // set up the scene for using memcpy() (which is presumably more efficient
978 // than copying the bytes one by one)
979 srcLen = GetLength(src, srcLen);
980 if ( srcLen == wxNO_LEN )
981 return wxCONV_FAILED;
982
983 const size_t inLen = srcLen/BYTES_PER_CHAR;
984 if ( dst )
985 {
986 if ( dstLen < inLen )
987 return wxCONV_FAILED;
988
989 memcpy(dst, src, srcLen);
990 }
991
992 return inLen;
993 }
994
995 size_t
996 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
997 const wchar_t *src, size_t srcLen) const
998 {
999 if ( srcLen == wxNO_LEN )
1000 srcLen = wxWcslen(src) + 1;
1001
1002 srcLen *= BYTES_PER_CHAR;
1003
1004 if ( dst )
1005 {
1006 if ( dstLen < srcLen )
1007 return wxCONV_FAILED;
1008
1009 memcpy(dst, src, srcLen);
1010 }
1011
1012 return srcLen;
1013 }
1014
1015 // ----------------------------------------------------------------------------
1016 // endian-reversing conversions
1017 // ----------------------------------------------------------------------------
1018
1019 size_t
1020 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1021 const char *src, size_t srcLen) const
1022 {
1023 srcLen = GetLength(src, srcLen);
1024 if ( srcLen == wxNO_LEN )
1025 return wxCONV_FAILED;
1026
1027 srcLen /= BYTES_PER_CHAR;
1028
1029 if ( dst )
1030 {
1031 if ( dstLen < srcLen )
1032 return wxCONV_FAILED;
1033
1034 const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
1035 for ( size_t n = 0; n < srcLen; n++, in++ )
1036 {
1037 *dst++ = wxUINT16_SWAP_ALWAYS(*in);
1038 }
1039 }
1040
1041 return srcLen;
1042 }
1043
1044 size_t
1045 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1046 const wchar_t *src, size_t srcLen) const
1047 {
1048 if ( srcLen == wxNO_LEN )
1049 srcLen = wxWcslen(src) + 1;
1050
1051 srcLen *= BYTES_PER_CHAR;
1052
1053 if ( dst )
1054 {
1055 if ( dstLen < srcLen )
1056 return wxCONV_FAILED;
1057
1058 wxUint16 *out = wx_reinterpret_cast(wxUint16 *, dst);
1059 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1060 {
1061 *out++ = wxUINT16_SWAP_ALWAYS(*src);
1062 }
1063 }
1064
1065 return srcLen;
1066 }
1067
1068 #else // !WC_UTF16: wchar_t is UTF-32
1069
1070 // ----------------------------------------------------------------------------
1071 // conversions without endianness change
1072 // ----------------------------------------------------------------------------
1073
1074 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1075 {
1076 size_t len=0;
1077
1078 while (*(wxUint16*)psz && (!buf || len < n))
1079 {
1080 wxUint32 cc;
1081 size_t pa=decode_utf16((wxUint16*)psz, cc);
1082 if (pa == wxCONV_FAILED)
1083 return pa;
1084
1085 if (buf)
1086 *buf++ = (wchar_t)cc;
1087 len++;
1088 psz += pa * sizeof(wxUint16);
1089 }
1090 if (buf && len<n) *buf=0;
1091
1092 return len;
1093 }
1094
1095
1096 // copy 32bit String to 16bit MB
1097 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1098 {
1099 size_t len=0;
1100
1101 while (*psz && (!buf || len < n))
1102 {
1103 wxUint16 cc[2];
1104 size_t pa=encode_utf16(*psz, cc);
1105
1106 if (pa == wxCONV_FAILED)
1107 return pa;
1108
1109 if (buf)
1110 {
1111 *(wxUint16*)buf = cc[0];
1112 buf += sizeof(wxUint16);
1113 if (pa > 1)
1114 {
1115 *(wxUint16*)buf = cc[1];
1116 buf += sizeof(wxUint16);
1117 }
1118 }
1119
1120 len += pa*sizeof(wxUint16);
1121 psz++;
1122 }
1123 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1124
1125 return len;
1126 }
1127
1128 // ----------------------------------------------------------------------------
1129 // endian-reversing conversions
1130 // ----------------------------------------------------------------------------
1131
1132 // swap 16bit MB to 32bit String
1133 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1134 {
1135 size_t len=0;
1136
1137 while (*(wxUint16*)psz && (!buf || len < n))
1138 {
1139 wxUint32 cc;
1140 char tmp[4];
1141 tmp[0]=psz[1]; tmp[1]=psz[0];
1142 tmp[2]=psz[3]; tmp[3]=psz[2];
1143
1144 size_t pa=decode_utf16((wxUint16*)tmp, cc);
1145 if (pa == wxCONV_FAILED)
1146 return pa;
1147
1148 if (buf)
1149 *buf++ = (wchar_t)cc;
1150
1151 len++;
1152 psz += pa * sizeof(wxUint16);
1153 }
1154 if (buf && len<n) *buf=0;
1155
1156 return len;
1157 }
1158
1159
1160 // swap 32bit String to 16bit MB
1161 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1162 {
1163 size_t len=0;
1164
1165 while (*psz && (!buf || len < n))
1166 {
1167 wxUint16 cc[2];
1168 size_t pa=encode_utf16(*psz, cc);
1169
1170 if (pa == wxCONV_FAILED)
1171 return pa;
1172
1173 if (buf)
1174 {
1175 *buf++ = ((char*)cc)[1];
1176 *buf++ = ((char*)cc)[0];
1177 if (pa > 1)
1178 {
1179 *buf++ = ((char*)cc)[3];
1180 *buf++ = ((char*)cc)[2];
1181 }
1182 }
1183
1184 len += pa*sizeof(wxUint16);
1185 psz++;
1186 }
1187 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
1188
1189 return len;
1190 }
1191
1192 #endif // WC_UTF16/!WC_UTF16
1193
1194
1195 // ----------------------------------------------------------------------------
1196 // UTF-32
1197 // ----------------------------------------------------------------------------
1198
// the functions below are written in terms of "straight" and "swap"
// conversions which are mapped to the big and little endian UTF-32
// conversion classes depending on whether WORDS_BIGENDIAN is defined
1199 #ifdef WORDS_BIGENDIAN
1200 #define wxMBConvUTF32straight wxMBConvUTF32BE
1201 #define wxMBConvUTF32swap wxMBConvUTF32LE
1202 #else
1203 #define wxMBConvUTF32swap wxMBConvUTF32BE
1204 #define wxMBConvUTF32straight wxMBConvUTF32LE
1205 #endif
1206
1207
// the global UTF-32 conversion objects, one for each byte order
1208 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1209 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1210
1211 /* static */
1212 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1213 {
1214 if ( srcLen == wxNO_LEN )
1215 {
1216 // count the number of bytes in input, including the trailing NULs
1217 const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1218 for ( srcLen = 1; *in++; srcLen++ )
1219 ;
1220
1221 srcLen *= BYTES_PER_CHAR;
1222 }
1223 else // we already have the length
1224 {
1225 // we can only convert an entire number of UTF-32 characters
1226 if ( srcLen % BYTES_PER_CHAR )
1227 return wxCONV_FAILED;
1228 }
1229
1230 return srcLen;
1231 }
1232
1233 // case when in-memory representation is UTF-16
1234 #ifdef WC_UTF16
1235
1236 // ----------------------------------------------------------------------------
1237 // conversions without endianness change
1238 // ----------------------------------------------------------------------------
1239
1240 size_t
1241 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1242 const char *src, size_t srcLen) const
1243 {
1244 srcLen = GetLength(src, srcLen);
1245 if ( srcLen == wxNO_LEN )
1246 return wxCONV_FAILED;
1247
1248 const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1249 const size_t inLen = srcLen/BYTES_PER_CHAR;
1250 size_t outLen = 0;
1251 for ( size_t n = 0; n < inLen; n++ )
1252 {
1253 wxUint16 cc[2];
1254 const size_t numChars = encode_utf16(*in++, cc);
1255 if ( numChars == wxCONV_FAILED )
1256 return wxCONV_FAILED;
1257
1258 outLen += numChars;
1259 if ( dst )
1260 {
1261 if ( outLen > dstLen )
1262 return wxCONV_FAILED;
1263
1264 *dst++ = cc[0];
1265 if ( numChars == 2 )
1266 {
1267 // second character of a surrogate
1268 *dst++ = cc[1];
1269 }
1270 }
1271 }
1272
1273 return outLen;
1274 }
1275
1276 size_t
1277 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1278 const wchar_t *src, size_t srcLen) const
1279 {
1280 if ( srcLen == wxNO_LEN )
1281 srcLen = wxWcslen(src) + 1;
1282
1283 if ( !dst )
1284 {
1285 // optimization: return maximal space which could be needed for this
1286 // string instead of the exact amount which could be less if there are
1287 // any surrogates in the input
1288 //
1289 // we consider that surrogates are rare enough to make it worthwhile to
1290 // avoid running the loop below at the cost of slightly extra memory
1291 // consumption
1292 return srcLen*BYTES_PER_CHAR;
1293 }
1294
1295 wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
1296 size_t outLen = 0;
1297 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1298 {
1299 const wxUint32 ch = wxDecodeSurrogate(&src);
1300 if ( !src )
1301 return wxCONV_FAILED;
1302
1303 outLen += BYTES_PER_CHAR;
1304
1305 if ( outLen > dstLen )
1306 return wxCONV_FAILED;
1307
1308 *out++ = ch;
1309 }
1310
1311 return outLen;
1312 }
1313
1314 // ----------------------------------------------------------------------------
1315 // endian-reversing conversions
1316 // ----------------------------------------------------------------------------
1317
1318 size_t
1319 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1320 const char *src, size_t srcLen) const
1321 {
1322 srcLen = GetLength(src, srcLen);
1323 if ( srcLen == wxNO_LEN )
1324 return wxCONV_FAILED;
1325
1326 const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1327 const size_t inLen = srcLen/BYTES_PER_CHAR;
1328 size_t outLen = 0;
1329 for ( size_t n = 0; n < inLen; n++, in++ )
1330 {
1331 wxUint16 cc[2];
1332 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*in), cc);
1333 if ( numChars == wxCONV_FAILED )
1334 return wxCONV_FAILED;
1335
1336 outLen += numChars;
1337 if ( dst )
1338 {
1339 if ( outLen > dstLen )
1340 return wxCONV_FAILED;
1341
1342 *dst++ = cc[0];
1343 if ( numChars == 2 )
1344 {
1345 // second character of a surrogate
1346 *dst++ = cc[1];
1347 }
1348 }
1349 }
1350
1351 return outLen;
1352 }
1353
1354 size_t
1355 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1356 const wchar_t *src, size_t srcLen) const
1357 {
1358 if ( srcLen == wxNO_LEN )
1359 srcLen = wxWcslen(src) + 1;
1360
1361 if ( !dst )
1362 {
1363 // optimization: return maximal space which could be needed for this
1364 // string instead of the exact amount which could be less if there are
1365 // any surrogates in the input
1366 //
1367 // we consider that surrogates are rare enough to make it worthwhile to
1368 // avoid running the loop below at the cost of slightly extra memory
1369 // consumption
1370 return srcLen*BYTES_PER_CHAR;
1371 }
1372
1373 wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
1374 size_t outLen = 0;
1375 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1376 {
1377 const wxUint32 ch = wxDecodeSurrogate(&src);
1378 if ( !src )
1379 return wxCONV_FAILED;
1380
1381 outLen += BYTES_PER_CHAR;
1382
1383 if ( outLen > dstLen )
1384 return wxCONV_FAILED;
1385
1386 *out++ = wxUINT32_SWAP_ALWAYS(ch);
1387 }
1388
1389 return outLen;
1390 }
1391
1392 #else // !WC_UTF16: wchar_t is UTF-32
1393
1394 // copy 32bit MB to 32bit String
1395 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1396 {
1397 size_t len=0;
1398
1399 while (*(wxUint32*)psz && (!buf || len < n))
1400 {
1401 if (buf)
1402 *buf++ = (wchar_t)(*(wxUint32*)psz);
1403 len++;
1404 psz += sizeof(wxUint32);
1405 }
1406
1407 if (buf && len<n)
1408 *buf=0;
1409
1410 return len;
1411 }
1412
1413
1414 // copy 32bit String to 32bit MB
1415 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1416 {
1417 size_t len=0;
1418
1419 while (*psz && (!buf || len < n))
1420 {
1421 if (buf)
1422 {
1423 *(wxUint32*)buf = *psz;
1424 buf += sizeof(wxUint32);
1425 }
1426
1427 len += sizeof(wxUint32);
1428 psz++;
1429 }
1430
1431 if (buf && len<=n-sizeof(wxUint32))
1432 *(wxUint32*)buf=0;
1433
1434 return len;
1435 }
1436
1437
1438 // swap 32bit MB to 32bit String
1439 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1440 {
1441 size_t len=0;
1442
1443 while (*(wxUint32*)psz && (!buf || len < n))
1444 {
1445 if (buf)
1446 {
1447 ((char *)buf)[0] = psz[3];
1448 ((char *)buf)[1] = psz[2];
1449 ((char *)buf)[2] = psz[1];
1450 ((char *)buf)[3] = psz[0];
1451 buf++;
1452 }
1453 len++;
1454 psz += sizeof(wxUint32);
1455 }
1456
1457 if (buf && len<n)
1458 *buf=0;
1459
1460 return len;
1461 }
1462
1463
1464 // swap 32bit String to 32bit MB
1465 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1466 {
1467 size_t len=0;
1468
1469 while (*psz && (!buf || len < n))
1470 {
1471 if (buf)
1472 {
1473 *buf++ = ((char *)psz)[3];
1474 *buf++ = ((char *)psz)[2];
1475 *buf++ = ((char *)psz)[1];
1476 *buf++ = ((char *)psz)[0];
1477 }
1478 len += sizeof(wxUint32);
1479 psz++;
1480 }
1481
1482 if (buf && len<=n-sizeof(wxUint32))
1483 *(wxUint32*)buf=0;
1484
1485 return len;
1486 }
1487
1488
1489 #endif // WC_UTF16/!WC_UTF16
1490
1491
1492 // ============================================================================
1493 // The classes doing conversion using the iconv_xxx() functions
1494 // ============================================================================
1495
1496 #ifdef HAVE_ICONV
1497
1498 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1499 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1500 // (unless there's yet another bug in glibc) the only case when iconv()
1501 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1502 // left in the input buffer -- when _real_ error occurs,
1503 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1504 // iconv() failure.
1505 // [This bug does not appear in glibc 2.2.]
1506 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1507 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1508 (errno != E2BIG || bufLeft != 0))
1509 #else
1510 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1511 #endif
1512
1513 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1514
1515 #define ICONV_T_INVALID ((iconv_t)-1)
1516
1517 #if SIZEOF_WCHAR_T == 4
1518 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1519 #define WC_ENC wxFONTENCODING_UTF32
1520 #elif SIZEOF_WCHAR_T == 2
1521 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1522 #define WC_ENC wxFONTENCODING_UTF16
1523 #else // sizeof(wchar_t) != 2 nor 4
1524 // does this ever happen?
1525 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1526 #endif
1527
1528 // ----------------------------------------------------------------------------
1529 // wxMBConv_iconv: encapsulates an iconv character set
1530 // ----------------------------------------------------------------------------
1531
1532 class wxMBConv_iconv : public wxMBConv
1533 {
1534 public:
1535 wxMBConv_iconv(const wxChar *name);
1536 virtual ~wxMBConv_iconv();
1537
1538 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1539 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1540
1541 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1542 virtual size_t GetMBNulLen() const;
1543
1544 virtual wxMBConv *Clone() const
1545 {
1546 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1547 p->m_minMBCharWidth = m_minMBCharWidth;
1548 return p;
1549 }
1550
1551 bool IsOk() const
1552 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1553
1554 protected:
1555 // the iconv handlers used to translate from multibyte to wide char and in
1556 // the other direction
1557 iconv_t m2w,
1558 w2m;
1559 #if wxUSE_THREADS
1560 // guards access to m2w and w2m objects
1561 wxMutex m_iconvMutex;
1562 #endif
1563
1564 private:
1565 // the name (for iconv_open()) of a wide char charset -- if none is
1566 // available on this machine, it will remain NULL
1567 static wxString ms_wcCharsetName;
1568
1569 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1570 // different endian-ness than the native one
1571 static bool ms_wcNeedsSwap;
1572
1573
1574 // name of the encoding handled by this conversion
1575 wxString m_name;
1576
1577 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1578 // initially
1579 size_t m_minMBCharWidth;
1580 };
1581
1582 // make the constructor available for unit testing
1583 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1584 {
1585 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1586 if ( !result->IsOk() )
1587 {
1588 delete result;
1589 return 0;
1590 }
1591 return result;
1592 }
1593
1594 wxString wxMBConv_iconv::ms_wcCharsetName;
1595 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1596
1597 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1598 : m_name(name)
1599 {
1600 m_minMBCharWidth = 0;
1601
1602 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1603 // names for the charsets
1604 const wxCharBuffer cname(wxString(name).ToAscii());
1605
1606 // check for charset that represents wchar_t:
1607 if ( ms_wcCharsetName.empty() )
1608 {
1609 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1610
1611 #if wxUSE_FONTMAP
1612 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1613 #else // !wxUSE_FONTMAP
1614 static const wxChar *names[] =
1615 {
1616 #if SIZEOF_WCHAR_T == 4
1617 _T("UCS-4"),
1618 #elif SIZEOF_WCHAR_T = 2
1619 _T("UCS-2"),
1620 #endif
1621 NULL
1622 };
1623 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1624
1625 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1626 {
1627 const wxString nameCS(*names);
1628
1629 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1630 wxString nameXE(nameCS);
1631 #ifdef WORDS_BIGENDIAN
1632 nameXE += _T("BE");
1633 #else // little endian
1634 nameXE += _T("LE");
1635 #endif
1636
1637 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1638 nameXE.c_str());
1639
1640 m2w = iconv_open(nameXE.ToAscii(), cname);
1641 if ( m2w == ICONV_T_INVALID )
1642 {
1643 // try charset w/o bytesex info (e.g. "UCS4")
1644 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1645 nameCS.c_str());
1646 m2w = iconv_open(nameCS.ToAscii(), cname);
1647
1648 // and check for bytesex ourselves:
1649 if ( m2w != ICONV_T_INVALID )
1650 {
1651 char buf[2], *bufPtr;
1652 wchar_t wbuf[2], *wbufPtr;
1653 size_t insz, outsz;
1654 size_t res;
1655
1656 buf[0] = 'A';
1657 buf[1] = 0;
1658 wbuf[0] = 0;
1659 insz = 2;
1660 outsz = SIZEOF_WCHAR_T * 2;
1661 wbufPtr = wbuf;
1662 bufPtr = buf;
1663
1664 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1665 (char**)&wbufPtr, &outsz);
1666
1667 if (ICONV_FAILED(res, insz))
1668 {
1669 wxLogLastError(wxT("iconv"));
1670 wxLogError(_("Conversion to charset '%s' doesn't work."),
1671 nameCS.c_str());
1672 }
1673 else // ok, can convert to this encoding, remember it
1674 {
1675 ms_wcCharsetName = nameCS;
1676 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1677 }
1678 }
1679 }
1680 else // use charset not requiring byte swapping
1681 {
1682 ms_wcCharsetName = nameXE;
1683 }
1684 }
1685
1686 wxLogTrace(TRACE_STRCONV,
1687 wxT("iconv wchar_t charset is \"%s\"%s"),
1688 ms_wcCharsetName.empty() ? _T("<none>")
1689 : ms_wcCharsetName.c_str(),
1690 ms_wcNeedsSwap ? _T(" (needs swap)")
1691 : _T(""));
1692 }
1693 else // we already have ms_wcCharsetName
1694 {
1695 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1696 }
1697
1698 if ( ms_wcCharsetName.empty() )
1699 {
1700 w2m = ICONV_T_INVALID;
1701 }
1702 else
1703 {
1704 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1705 if ( w2m == ICONV_T_INVALID )
1706 {
1707 wxLogTrace(TRACE_STRCONV,
1708 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1709 ms_wcCharsetName.c_str(), cname.data());
1710 }
1711 }
1712 }
1713
1714 wxMBConv_iconv::~wxMBConv_iconv()
1715 {
1716 if ( m2w != ICONV_T_INVALID )
1717 iconv_close(m2w);
1718 if ( w2m != ICONV_T_INVALID )
1719 iconv_close(w2m);
1720 }
1721
1722 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1723 {
1724 // find the string length: notice that must be done differently for
1725 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1726 size_t inbuf;
1727 const size_t nulLen = GetMBNulLen();
1728 switch ( nulLen )
1729 {
1730 default:
1731 return wxCONV_FAILED;
1732
1733 case 1:
1734 inbuf = strlen(psz); // arguably more optimized than our version
1735 break;
1736
1737 case 2:
1738 case 4:
1739 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1740 // they also have to start at character boundary and not span two
1741 // adjacent characters
1742 const char *p;
1743 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1744 ;
1745 inbuf = p - psz;
1746 break;
1747 }
1748
1749 #if wxUSE_THREADS
1750 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1751 // Unfortunately there is a couple of global wxCSConv objects such as
1752 // wxConvLocal that are used all over wx code, so we have to make sure
1753 // the handle is used by at most one thread at the time. Otherwise
1754 // only a few wx classes would be safe to use from non-main threads
1755 // as MB<->WC conversion would fail "randomly".
1756 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1757 #endif // wxUSE_THREADS
1758
1759
1760 size_t outbuf = n * SIZEOF_WCHAR_T;
1761 size_t res, cres;
1762 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1763 wchar_t *bufPtr = buf;
1764 const char *pszPtr = psz;
1765
1766 if (buf)
1767 {
1768 // have destination buffer, convert there
1769 cres = iconv(m2w,
1770 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1771 (char**)&bufPtr, &outbuf);
1772 res = n - (outbuf / SIZEOF_WCHAR_T);
1773
1774 if (ms_wcNeedsSwap)
1775 {
1776 // convert to native endianness
1777 for ( unsigned i = 0; i < res; i++ )
1778 buf[n] = WC_BSWAP(buf[i]);
1779 }
1780
1781 // NUL-terminate the string if there is any space left
1782 if (res < n)
1783 buf[res] = 0;
1784 }
1785 else
1786 {
1787 // no destination buffer... convert using temp buffer
1788 // to calculate destination buffer requirement
1789 wchar_t tbuf[8];
1790 res = 0;
1791 do {
1792 bufPtr = tbuf;
1793 outbuf = 8*SIZEOF_WCHAR_T;
1794
1795 cres = iconv(m2w,
1796 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1797 (char**)&bufPtr, &outbuf );
1798
1799 res += 8-(outbuf/SIZEOF_WCHAR_T);
1800 } while ((cres==(size_t)-1) && (errno==E2BIG));
1801 }
1802
1803 if (ICONV_FAILED(cres, inbuf))
1804 {
1805 //VS: it is ok if iconv fails, hence trace only
1806 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1807 return wxCONV_FAILED;
1808 }
1809
1810 return res;
1811 }
1812
1813 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1814 {
1815 #if wxUSE_THREADS
1816 // NB: explained in MB2WC
1817 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1818 #endif
1819
1820 size_t inlen = wxWcslen(psz);
1821 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1822 size_t outbuf = n;
1823 size_t res, cres;
1824
1825 wchar_t *tmpbuf = 0;
1826
1827 if (ms_wcNeedsSwap)
1828 {
1829 // need to copy to temp buffer to switch endianness
1830 // (doing WC_BSWAP twice on the original buffer won't help, as it
1831 // could be in read-only memory, or be accessed in some other thread)
1832 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1833 for ( size_t i = 0; i < inlen; i++ )
1834 tmpbuf[n] = WC_BSWAP(psz[i]);
1835 tmpbuf[inlen] = L'\0';
1836 psz = tmpbuf;
1837 }
1838
1839 if (buf)
1840 {
1841 // have destination buffer, convert there
1842 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1843
1844 res = n-outbuf;
1845
1846 // NB: iconv was given only wcslen(psz) characters on input, and so
1847 // it couldn't convert the trailing zero. Let's do it ourselves
1848 // if there's some room left for it in the output buffer.
1849 if (res < n)
1850 buf[0] = 0;
1851 }
1852 else
1853 {
1854 // no destination buffer... convert using temp buffer
1855 // to calculate destination buffer requirement
1856 char tbuf[16];
1857 res = 0;
1858 do {
1859 buf = tbuf; outbuf = 16;
1860
1861 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1862
1863 res += 16 - outbuf;
1864 } while ((cres==(size_t)-1) && (errno==E2BIG));
1865 }
1866
1867 if (ms_wcNeedsSwap)
1868 {
1869 free(tmpbuf);
1870 }
1871
1872 if (ICONV_FAILED(cres, inbuf))
1873 {
1874 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1875 return wxCONV_FAILED;
1876 }
1877
1878 return res;
1879 }
1880
1881 size_t wxMBConv_iconv::GetMBNulLen() const
1882 {
1883 if ( m_minMBCharWidth == 0 )
1884 {
1885 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1886
1887 #if wxUSE_THREADS
1888 // NB: explained in MB2WC
1889 wxMutexLocker lock(self->m_iconvMutex);
1890 #endif
1891
1892 wchar_t *wnul = L"";
1893 char buf[8]; // should be enough for NUL in any encoding
1894 size_t inLen = sizeof(wchar_t),
1895 outLen = WXSIZEOF(buf);
1896 char *in = (char *)wnul;
1897 char *out = buf;
1898 if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1899 {
1900 self->m_minMBCharWidth = (size_t)-1;
1901 }
1902 else // ok
1903 {
1904 self->m_minMBCharWidth = out - buf;
1905 }
1906 }
1907
1908 return m_minMBCharWidth;
1909 }
1910
1911 #endif // HAVE_ICONV
1912
1913
1914 // ============================================================================
1915 // Win32 conversion classes
1916 // ============================================================================
1917
1918 #ifdef wxHAVE_WIN32_MB2WC
1919
1920 // from utils.cpp
1921 #if wxUSE_FONTMAP
1922 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1923 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1924 #endif
1925
1926 class wxMBConv_win32 : public wxMBConv
1927 {
1928 public:
1929 wxMBConv_win32()
1930 {
1931 m_CodePage = CP_ACP;
1932 m_minMBCharWidth = 0;
1933 }
1934
1935 wxMBConv_win32(const wxMBConv_win32& conv)
1936 {
1937 m_CodePage = conv.m_CodePage;
1938 m_minMBCharWidth = conv.m_minMBCharWidth;
1939 }
1940
1941 #if wxUSE_FONTMAP
1942 wxMBConv_win32(const wxChar* name)
1943 {
1944 m_CodePage = wxCharsetToCodepage(name);
1945 m_minMBCharWidth = 0;
1946 }
1947
1948 wxMBConv_win32(wxFontEncoding encoding)
1949 {
1950 m_CodePage = wxEncodingToCodepage(encoding);
1951 m_minMBCharWidth = 0;
1952 }
1953 #endif // wxUSE_FONTMAP
1954
1955 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1956 {
1957 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1958 // the behaviour is not compatible with the Unix version (using iconv)
1959 // and break the library itself, e.g. wxTextInputStream::NextChar()
1960 // wouldn't work if reading an incomplete MB char didn't result in an
1961 // error
1962 //
1963 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1964 // Win XP or newer and it is not supported for UTF-[78] so we always
1965 // use our own conversions in this case. See
1966 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1967 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1968 if ( m_CodePage == CP_UTF8 )
1969 {
1970 return wxConvUTF8.MB2WC(buf, psz, n);
1971 }
1972
1973 if ( m_CodePage == CP_UTF7 )
1974 {
1975 return wxConvUTF7.MB2WC(buf, psz, n);
1976 }
1977
1978 int flags = 0;
1979 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
1980 IsAtLeastWin2kSP4() )
1981 {
1982 flags = MB_ERR_INVALID_CHARS;
1983 }
1984
1985 const size_t len = ::MultiByteToWideChar
1986 (
1987 m_CodePage, // code page
1988 flags, // flags: fall on error
1989 psz, // input string
1990 -1, // its length (NUL-terminated)
1991 buf, // output string
1992 buf ? n : 0 // size of output buffer
1993 );
1994 if ( !len )
1995 {
1996 // function totally failed
1997 return wxCONV_FAILED;
1998 }
1999
2000 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2001 // check if we succeeded, by doing a double trip:
2002 if ( !flags && buf )
2003 {
2004 const size_t mbLen = strlen(psz);
2005 wxCharBuffer mbBuf(mbLen);
2006 if ( ::WideCharToMultiByte
2007 (
2008 m_CodePage,
2009 0,
2010 buf,
2011 -1,
2012 mbBuf.data(),
2013 mbLen + 1, // size in bytes, not length
2014 NULL,
2015 NULL
2016 ) == 0 ||
2017 strcmp(mbBuf, psz) != 0 )
2018 {
2019 // we didn't obtain the same thing we started from, hence
2020 // the conversion was lossy and we consider that it failed
2021 return wxCONV_FAILED;
2022 }
2023 }
2024
2025 // note that it returns count of written chars for buf != NULL and size
2026 // of the needed buffer for buf == NULL so in either case the length of
2027 // the string (which never includes the terminating NUL) is one less
2028 return len - 1;
2029 }
2030
2031 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2032 {
2033 /*
2034 we have a problem here: by default, WideCharToMultiByte() may
2035 replace characters unrepresentable in the target code page with bad
2036 quality approximations such as turning "1/2" symbol (U+00BD) into
2037 "1" for the code pages which don't have it and we, obviously, want
2038 to avoid this at any price
2039
2040 the trouble is that this function does it _silently_, i.e. it won't
2041 even tell us whether it did or not... Win98/2000 and higher provide
2042 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2043 we have to resort to a round trip, i.e. check that converting back
2044 results in the same string -- this is, of course, expensive but
2045 otherwise we simply can't be sure to not garble the data.
2046 */
2047
2048 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2049 // it doesn't work with CJK encodings (which we test for rather roughly
2050 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2051 // supporting it
2052 BOOL usedDef wxDUMMY_INITIALIZE(false);
2053 BOOL *pUsedDef;
2054 int flags;
2055 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2056 {
2057 // it's our lucky day
2058 flags = WC_NO_BEST_FIT_CHARS;
2059 pUsedDef = &usedDef;
2060 }
2061 else // old system or unsupported encoding
2062 {
2063 flags = 0;
2064 pUsedDef = NULL;
2065 }
2066
2067 const size_t len = ::WideCharToMultiByte
2068 (
2069 m_CodePage, // code page
2070 flags, // either none or no best fit
2071 pwz, // input string
2072 -1, // it is (wide) NUL-terminated
2073 buf, // output buffer
2074 buf ? n : 0, // and its size
2075 NULL, // default "replacement" char
2076 pUsedDef // [out] was it used?
2077 );
2078
2079 if ( !len )
2080 {
2081 // function totally failed
2082 return wxCONV_FAILED;
2083 }
2084
2085 // if we were really converting, check if we succeeded
2086 if ( buf )
2087 {
2088 if ( flags )
2089 {
2090 // check if the conversion failed, i.e. if any replacements
2091 // were done
2092 if ( usedDef )
2093 return wxCONV_FAILED;
2094 }
2095 else // we must resort to double tripping...
2096 {
2097 wxWCharBuffer wcBuf(n);
2098 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2099 wcscmp(wcBuf, pwz) != 0 )
2100 {
2101 // we didn't obtain the same thing we started from, hence
2102 // the conversion was lossy and we consider that it failed
2103 return wxCONV_FAILED;
2104 }
2105 }
2106 }
2107
2108 // see the comment above for the reason of "len - 1"
2109 return len - 1;
2110 }
2111
2112 virtual size_t GetMBNulLen() const
2113 {
2114 if ( m_minMBCharWidth == 0 )
2115 {
2116 int len = ::WideCharToMultiByte
2117 (
2118 m_CodePage, // code page
2119 0, // no flags
2120 L"", // input string
2121 1, // translate just the NUL
2122 NULL, // output buffer
2123 0, // and its size
2124 NULL, // no replacement char
2125 NULL // [out] don't care if it was used
2126 );
2127
2128 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2129 switch ( len )
2130 {
2131 default:
2132 wxLogDebug(_T("Unexpected NUL length %d"), len);
2133 // fall through
2134
2135 case 0:
2136 self->m_minMBCharWidth = (size_t)-1;
2137 break;
2138
2139 case 1:
2140 case 2:
2141 case 4:
2142 self->m_minMBCharWidth = len;
2143 break;
2144 }
2145 }
2146
2147 return m_minMBCharWidth;
2148 }
2149
2150 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2151
2152 bool IsOk() const { return m_CodePage != -1; }
2153
2154 private:
2155 static bool CanUseNoBestFit()
2156 {
2157 static int s_isWin98Or2k = -1;
2158
2159 if ( s_isWin98Or2k == -1 )
2160 {
2161 int verMaj, verMin;
2162 switch ( wxGetOsVersion(&verMaj, &verMin) )
2163 {
2164 case wxWIN95:
2165 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2166 break;
2167
2168 case wxWINDOWS_NT:
2169 s_isWin98Or2k = verMaj >= 5;
2170 break;
2171
2172 default:
2173 // unknown, be conseravtive by default
2174 s_isWin98Or2k = 0;
2175 }
2176
2177 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2178 }
2179
2180 return s_isWin98Or2k == 1;
2181 }
2182
2183 static bool IsAtLeastWin2kSP4()
2184 {
2185 #ifdef __WXWINCE__
2186 return false;
2187 #else
2188 static int s_isAtLeastWin2kSP4 = -1;
2189
2190 if ( s_isAtLeastWin2kSP4 == -1 )
2191 {
2192 OSVERSIONINFOEX ver;
2193
2194 memset(&ver, 0, sizeof(ver));
2195 ver.dwOSVersionInfoSize = sizeof(ver);
2196 GetVersionEx((OSVERSIONINFO*)&ver);
2197
2198 s_isAtLeastWin2kSP4 =
2199 ((ver.dwMajorVersion > 5) || // Vista+
2200 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2201 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2202 ver.wServicePackMajor >= 4)) // 2000 SP4+
2203 ? 1 : 0;
2204 }
2205
2206 return s_isAtLeastWin2kSP4 == 1;
2207 #endif
2208 }
2209
2210
2211 // the code page we're working with
2212 long m_CodePage;
2213
2214 // cached result of GetMBNulLen(), set to 0 initially meaning
2215 // "unknown"
2216 size_t m_minMBCharWidth;
2217 };
2218
2219 #endif // wxHAVE_WIN32_MB2WC
2220
2221 // ============================================================================
2222 // Cocoa conversion classes
2223 // ============================================================================
2224
2225 #if defined(__WXCOCOA__)
2226
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
2230
2231 #include <CoreFoundation/CFString.h>
2232 #include <CoreFoundation/CFStringEncodingExt.h>
2233
2234 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2235 {
2236 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2237 if ( encoding == wxFONTENCODING_DEFAULT )
2238 {
2239 enc = CFStringGetSystemEncoding();
2240 }
2241 else switch( encoding)
2242 {
2243 case wxFONTENCODING_ISO8859_1 :
2244 enc = kCFStringEncodingISOLatin1 ;
2245 break ;
2246 case wxFONTENCODING_ISO8859_2 :
2247 enc = kCFStringEncodingISOLatin2;
2248 break ;
2249 case wxFONTENCODING_ISO8859_3 :
2250 enc = kCFStringEncodingISOLatin3 ;
2251 break ;
2252 case wxFONTENCODING_ISO8859_4 :
2253 enc = kCFStringEncodingISOLatin4;
2254 break ;
2255 case wxFONTENCODING_ISO8859_5 :
2256 enc = kCFStringEncodingISOLatinCyrillic;
2257 break ;
2258 case wxFONTENCODING_ISO8859_6 :
2259 enc = kCFStringEncodingISOLatinArabic;
2260 break ;
2261 case wxFONTENCODING_ISO8859_7 :
2262 enc = kCFStringEncodingISOLatinGreek;
2263 break ;
2264 case wxFONTENCODING_ISO8859_8 :
2265 enc = kCFStringEncodingISOLatinHebrew;
2266 break ;
2267 case wxFONTENCODING_ISO8859_9 :
2268 enc = kCFStringEncodingISOLatin5;
2269 break ;
2270 case wxFONTENCODING_ISO8859_10 :
2271 enc = kCFStringEncodingISOLatin6;
2272 break ;
2273 case wxFONTENCODING_ISO8859_11 :
2274 enc = kCFStringEncodingISOLatinThai;
2275 break ;
2276 case wxFONTENCODING_ISO8859_13 :
2277 enc = kCFStringEncodingISOLatin7;
2278 break ;
2279 case wxFONTENCODING_ISO8859_14 :
2280 enc = kCFStringEncodingISOLatin8;
2281 break ;
2282 case wxFONTENCODING_ISO8859_15 :
2283 enc = kCFStringEncodingISOLatin9;
2284 break ;
2285
2286 case wxFONTENCODING_KOI8 :
2287 enc = kCFStringEncodingKOI8_R;
2288 break ;
2289 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2290 enc = kCFStringEncodingDOSRussian;
2291 break ;
2292
2293 // case wxFONTENCODING_BULGARIAN :
2294 // enc = ;
2295 // break ;
2296
2297 case wxFONTENCODING_CP437 :
2298 enc =kCFStringEncodingDOSLatinUS ;
2299 break ;
2300 case wxFONTENCODING_CP850 :
2301 enc = kCFStringEncodingDOSLatin1;
2302 break ;
2303 case wxFONTENCODING_CP852 :
2304 enc = kCFStringEncodingDOSLatin2;
2305 break ;
2306 case wxFONTENCODING_CP855 :
2307 enc = kCFStringEncodingDOSCyrillic;
2308 break ;
2309 case wxFONTENCODING_CP866 :
2310 enc =kCFStringEncodingDOSRussian ;
2311 break ;
2312 case wxFONTENCODING_CP874 :
2313 enc = kCFStringEncodingDOSThai;
2314 break ;
2315 case wxFONTENCODING_CP932 :
2316 enc = kCFStringEncodingDOSJapanese;
2317 break ;
2318 case wxFONTENCODING_CP936 :
2319 enc =kCFStringEncodingDOSChineseSimplif ;
2320 break ;
2321 case wxFONTENCODING_CP949 :
2322 enc = kCFStringEncodingDOSKorean;
2323 break ;
2324 case wxFONTENCODING_CP950 :
2325 enc = kCFStringEncodingDOSChineseTrad;
2326 break ;
2327 case wxFONTENCODING_CP1250 :
2328 enc = kCFStringEncodingWindowsLatin2;
2329 break ;
2330 case wxFONTENCODING_CP1251 :
2331 enc =kCFStringEncodingWindowsCyrillic ;
2332 break ;
2333 case wxFONTENCODING_CP1252 :
2334 enc =kCFStringEncodingWindowsLatin1 ;
2335 break ;
2336 case wxFONTENCODING_CP1253 :
2337 enc = kCFStringEncodingWindowsGreek;
2338 break ;
2339 case wxFONTENCODING_CP1254 :
2340 enc = kCFStringEncodingWindowsLatin5;
2341 break ;
2342 case wxFONTENCODING_CP1255 :
2343 enc =kCFStringEncodingWindowsHebrew ;
2344 break ;
2345 case wxFONTENCODING_CP1256 :
2346 enc =kCFStringEncodingWindowsArabic ;
2347 break ;
2348 case wxFONTENCODING_CP1257 :
2349 enc = kCFStringEncodingWindowsBalticRim;
2350 break ;
2351 // This only really encodes to UTF7 (if that) evidently
2352 // case wxFONTENCODING_UTF7 :
2353 // enc = kCFStringEncodingNonLossyASCII ;
2354 // break ;
2355 case wxFONTENCODING_UTF8 :
2356 enc = kCFStringEncodingUTF8 ;
2357 break ;
2358 case wxFONTENCODING_EUC_JP :
2359 enc = kCFStringEncodingEUC_JP;
2360 break ;
2361 case wxFONTENCODING_UTF16 :
2362 enc = kCFStringEncodingUnicode ;
2363 break ;
2364 case wxFONTENCODING_MACROMAN :
2365 enc = kCFStringEncodingMacRoman ;
2366 break ;
2367 case wxFONTENCODING_MACJAPANESE :
2368 enc = kCFStringEncodingMacJapanese ;
2369 break ;
2370 case wxFONTENCODING_MACCHINESETRAD :
2371 enc = kCFStringEncodingMacChineseTrad ;
2372 break ;
2373 case wxFONTENCODING_MACKOREAN :
2374 enc = kCFStringEncodingMacKorean ;
2375 break ;
2376 case wxFONTENCODING_MACARABIC :
2377 enc = kCFStringEncodingMacArabic ;
2378 break ;
2379 case wxFONTENCODING_MACHEBREW :
2380 enc = kCFStringEncodingMacHebrew ;
2381 break ;
2382 case wxFONTENCODING_MACGREEK :
2383 enc = kCFStringEncodingMacGreek ;
2384 break ;
2385 case wxFONTENCODING_MACCYRILLIC :
2386 enc = kCFStringEncodingMacCyrillic ;
2387 break ;
2388 case wxFONTENCODING_MACDEVANAGARI :
2389 enc = kCFStringEncodingMacDevanagari ;
2390 break ;
2391 case wxFONTENCODING_MACGURMUKHI :
2392 enc = kCFStringEncodingMacGurmukhi ;
2393 break ;
2394 case wxFONTENCODING_MACGUJARATI :
2395 enc = kCFStringEncodingMacGujarati ;
2396 break ;
2397 case wxFONTENCODING_MACORIYA :
2398 enc = kCFStringEncodingMacOriya ;
2399 break ;
2400 case wxFONTENCODING_MACBENGALI :
2401 enc = kCFStringEncodingMacBengali ;
2402 break ;
2403 case wxFONTENCODING_MACTAMIL :
2404 enc = kCFStringEncodingMacTamil ;
2405 break ;
2406 case wxFONTENCODING_MACTELUGU :
2407 enc = kCFStringEncodingMacTelugu ;
2408 break ;
2409 case wxFONTENCODING_MACKANNADA :
2410 enc = kCFStringEncodingMacKannada ;
2411 break ;
2412 case wxFONTENCODING_MACMALAJALAM :
2413 enc = kCFStringEncodingMacMalayalam ;
2414 break ;
2415 case wxFONTENCODING_MACSINHALESE :
2416 enc = kCFStringEncodingMacSinhalese ;
2417 break ;
2418 case wxFONTENCODING_MACBURMESE :
2419 enc = kCFStringEncodingMacBurmese ;
2420 break ;
2421 case wxFONTENCODING_MACKHMER :
2422 enc = kCFStringEncodingMacKhmer ;
2423 break ;
2424 case wxFONTENCODING_MACTHAI :
2425 enc = kCFStringEncodingMacThai ;
2426 break ;
2427 case wxFONTENCODING_MACLAOTIAN :
2428 enc = kCFStringEncodingMacLaotian ;
2429 break ;
2430 case wxFONTENCODING_MACGEORGIAN :
2431 enc = kCFStringEncodingMacGeorgian ;
2432 break ;
2433 case wxFONTENCODING_MACARMENIAN :
2434 enc = kCFStringEncodingMacArmenian ;
2435 break ;
2436 case wxFONTENCODING_MACCHINESESIMP :
2437 enc = kCFStringEncodingMacChineseSimp ;
2438 break ;
2439 case wxFONTENCODING_MACTIBETAN :
2440 enc = kCFStringEncodingMacTibetan ;
2441 break ;
2442 case wxFONTENCODING_MACMONGOLIAN :
2443 enc = kCFStringEncodingMacMongolian ;
2444 break ;
2445 case wxFONTENCODING_MACETHIOPIC :
2446 enc = kCFStringEncodingMacEthiopic ;
2447 break ;
2448 case wxFONTENCODING_MACCENTRALEUR :
2449 enc = kCFStringEncodingMacCentralEurRoman ;
2450 break ;
2451 case wxFONTENCODING_MACVIATNAMESE :
2452 enc = kCFStringEncodingMacVietnamese ;
2453 break ;
2454 case wxFONTENCODING_MACARABICEXT :
2455 enc = kCFStringEncodingMacExtArabic ;
2456 break ;
2457 case wxFONTENCODING_MACSYMBOL :
2458 enc = kCFStringEncodingMacSymbol ;
2459 break ;
2460 case wxFONTENCODING_MACDINGBATS :
2461 enc = kCFStringEncodingMacDingbats ;
2462 break ;
2463 case wxFONTENCODING_MACTURKISH :
2464 enc = kCFStringEncodingMacTurkish ;
2465 break ;
2466 case wxFONTENCODING_MACCROATIAN :
2467 enc = kCFStringEncodingMacCroatian ;
2468 break ;
2469 case wxFONTENCODING_MACICELANDIC :
2470 enc = kCFStringEncodingMacIcelandic ;
2471 break ;
2472 case wxFONTENCODING_MACROMANIAN :
2473 enc = kCFStringEncodingMacRomanian ;
2474 break ;
2475 case wxFONTENCODING_MACCELTIC :
2476 enc = kCFStringEncodingMacCeltic ;
2477 break ;
2478 case wxFONTENCODING_MACGAELIC :
2479 enc = kCFStringEncodingMacGaelic ;
2480 break ;
2481 // case wxFONTENCODING_MACKEYBOARD :
2482 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2483 // break ;
2484 default :
2485 // because gcc is picky
2486 break ;
2487 } ;
2488 return enc ;
2489 }
2490
2491 class wxMBConv_cocoa : public wxMBConv
2492 {
2493 public:
2494 wxMBConv_cocoa()
2495 {
2496 Init(CFStringGetSystemEncoding()) ;
2497 }
2498
2499 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2500 {
2501 m_encoding = conv.m_encoding;
2502 }
2503
2504 #if wxUSE_FONTMAP
2505 wxMBConv_cocoa(const wxChar* name)
2506 {
2507 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2508 }
2509 #endif
2510
2511 wxMBConv_cocoa(wxFontEncoding encoding)
2512 {
2513 Init( wxCFStringEncFromFontEnc(encoding) );
2514 }
2515
2516 ~wxMBConv_cocoa()
2517 {
2518 }
2519
2520 void Init( CFStringEncoding encoding)
2521 {
2522 m_encoding = encoding ;
2523 }
2524
2525 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2526 {
2527 wxASSERT(szUnConv);
2528
2529 CFStringRef theString = CFStringCreateWithBytes (
2530 NULL, //the allocator
2531 (const UInt8*)szUnConv,
2532 strlen(szUnConv),
2533 m_encoding,
2534 false //no BOM/external representation
2535 );
2536
2537 wxASSERT(theString);
2538
2539 size_t nOutLength = CFStringGetLength(theString);
2540
2541 if (szOut == NULL)
2542 {
2543 CFRelease(theString);
2544 return nOutLength;
2545 }
2546
2547 CFRange theRange = { 0, nOutSize };
2548
2549 #if SIZEOF_WCHAR_T == 4
2550 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2551 #endif
2552
2553 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2554
2555 CFRelease(theString);
2556
2557 szUniCharBuffer[nOutLength] = '\0' ;
2558
2559 #if SIZEOF_WCHAR_T == 4
2560 wxMBConvUTF16 converter ;
2561 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2562 delete[] szUniCharBuffer;
2563 #endif
2564
2565 return nOutLength;
2566 }
2567
2568 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2569 {
2570 wxASSERT(szUnConv);
2571
2572 size_t nRealOutSize;
2573 size_t nBufSize = wxWcslen(szUnConv);
2574 UniChar* szUniBuffer = (UniChar*) szUnConv;
2575
2576 #if SIZEOF_WCHAR_T == 4
2577 wxMBConvUTF16 converter ;
2578 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2579 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2580 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2581 nBufSize /= sizeof(UniChar);
2582 #endif
2583
2584 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2585 NULL, //allocator
2586 szUniBuffer,
2587 nBufSize,
2588 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2589 );
2590
2591 wxASSERT(theString);
2592
2593 //Note that CER puts a BOM when converting to unicode
2594 //so we check and use getchars instead in that case
2595 if (m_encoding == kCFStringEncodingUnicode)
2596 {
2597 if (szOut != NULL)
2598 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2599
2600 nRealOutSize = CFStringGetLength(theString) + 1;
2601 }
2602 else
2603 {
2604 CFStringGetBytes(
2605 theString,
2606 CFRangeMake(0, CFStringGetLength(theString)),
2607 m_encoding,
2608 0, //what to put in characters that can't be converted -
2609 //0 tells CFString to return NULL if it meets such a character
2610 false, //not an external representation
2611 (UInt8*) szOut,
2612 nOutSize,
2613 (CFIndex*) &nRealOutSize
2614 );
2615 }
2616
2617 CFRelease(theString);
2618
2619 #if SIZEOF_WCHAR_T == 4
2620 delete[] szUniBuffer;
2621 #endif
2622
2623 return nRealOutSize - 1;
2624 }
2625
2626 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2627
2628 bool IsOk() const
2629 {
2630 return m_encoding != kCFStringEncodingInvalidId &&
2631 CFStringIsEncodingAvailable(m_encoding);
2632 }
2633
2634 private:
2635 CFStringEncoding m_encoding ;
2636 };
2637
2638 #endif // defined(__WXCOCOA__)
2639
2640 // ============================================================================
2641 // Mac conversion classes
2642 // ============================================================================
2643
2644 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2645
2646 class wxMBConv_mac : public wxMBConv
2647 {
2648 public:
2649 wxMBConv_mac()
2650 {
2651 Init(CFStringGetSystemEncoding()) ;
2652 }
2653
2654 wxMBConv_mac(const wxMBConv_mac& conv)
2655 {
2656 Init(conv.m_char_encoding);
2657 }
2658
2659 #if wxUSE_FONTMAP
2660 wxMBConv_mac(const wxChar* name)
2661 {
2662 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2663 }
2664 #endif
2665
2666 wxMBConv_mac(wxFontEncoding encoding)
2667 {
2668 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2669 }
2670
2671 ~wxMBConv_mac()
2672 {
2673 OSStatus status = noErr ;
2674 status = TECDisposeConverter(m_MB2WC_converter);
2675 status = TECDisposeConverter(m_WC2MB_converter);
2676 }
2677
2678
2679 void Init( TextEncodingBase encoding)
2680 {
2681 OSStatus status = noErr ;
2682 m_char_encoding = encoding ;
2683 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2684
2685 status = TECCreateConverter(&m_MB2WC_converter,
2686 m_char_encoding,
2687 m_unicode_encoding);
2688 status = TECCreateConverter(&m_WC2MB_converter,
2689 m_unicode_encoding,
2690 m_char_encoding);
2691 }
2692
2693 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2694 {
2695 OSStatus status = noErr ;
2696 ByteCount byteOutLen ;
2697 ByteCount byteInLen = strlen(psz) ;
2698 wchar_t *tbuf = NULL ;
2699 UniChar* ubuf = NULL ;
2700 size_t res = 0 ;
2701
2702 if (buf == NULL)
2703 {
2704 //apple specs say at least 32
2705 n = wxMax( 32 , byteInLen ) ;
2706 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2707 }
2708 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2709 #if SIZEOF_WCHAR_T == 4
2710 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2711 #else
2712 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2713 #endif
2714 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2715 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2716 #if SIZEOF_WCHAR_T == 4
2717 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2718 // is not properly terminated we get random characters at the end
2719 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2720 wxMBConvUTF16 converter ;
2721 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2722 free( ubuf ) ;
2723 #else
2724 res = byteOutLen / sizeof( UniChar ) ;
2725 #endif
2726 if ( buf == NULL )
2727 free(tbuf) ;
2728
2729 if ( buf && res < n)
2730 buf[res] = 0;
2731
2732 return res ;
2733 }
2734
2735 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2736 {
2737 OSStatus status = noErr ;
2738 ByteCount byteOutLen ;
2739 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2740
2741 char *tbuf = NULL ;
2742
2743 if (buf == NULL)
2744 {
2745 //apple specs say at least 32
2746 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2747 tbuf = (char*) malloc( n ) ;
2748 }
2749
2750 ByteCount byteBufferLen = n ;
2751 UniChar* ubuf = NULL ;
2752 #if SIZEOF_WCHAR_T == 4
2753 wxMBConvUTF16 converter ;
2754 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2755 byteInLen = unicharlen ;
2756 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2757 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2758 #else
2759 ubuf = (UniChar*) psz ;
2760 #endif
2761 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2762 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2763 #if SIZEOF_WCHAR_T == 4
2764 free( ubuf ) ;
2765 #endif
2766 if ( buf == NULL )
2767 free(tbuf) ;
2768
2769 size_t res = byteOutLen ;
2770 if ( buf && res < n)
2771 {
2772 buf[res] = 0;
2773
2774 //we need to double-trip to verify it didn't insert any ? in place
2775 //of bogus characters
2776 wxWCharBuffer wcBuf(n);
2777 size_t pszlen = wxWcslen(psz);
2778 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2779 wxWcslen(wcBuf) != pszlen ||
2780 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2781 {
2782 // we didn't obtain the same thing we started from, hence
2783 // the conversion was lossy and we consider that it failed
2784 return wxCONV_FAILED;
2785 }
2786 }
2787
2788 return res ;
2789 }
2790
2791 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2792
2793 bool IsOk() const
2794 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2795
2796 private:
2797 TECObjectRef m_MB2WC_converter ;
2798 TECObjectRef m_WC2MB_converter ;
2799
2800 TextEncodingBase m_char_encoding ;
2801 TextEncodingBase m_unicode_encoding ;
2802 };
2803
2804 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2805
2806 // ============================================================================
2807 // wxEncodingConverter based conversion classes
2808 // ============================================================================
2809
2810 #if wxUSE_FONTMAP
2811
2812 class wxMBConv_wxwin : public wxMBConv
2813 {
2814 private:
2815 void Init()
2816 {
2817 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2818 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2819 }
2820
2821 public:
2822 // temporarily just use wxEncodingConverter stuff,
2823 // so that it works while a better implementation is built
2824 wxMBConv_wxwin(const wxChar* name)
2825 {
2826 if (name)
2827 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2828 else
2829 m_enc = wxFONTENCODING_SYSTEM;
2830
2831 Init();
2832 }
2833
2834 wxMBConv_wxwin(wxFontEncoding enc)
2835 {
2836 m_enc = enc;
2837
2838 Init();
2839 }
2840
2841 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2842 {
2843 size_t inbuf = strlen(psz);
2844 if (buf)
2845 {
2846 if (!m2w.Convert(psz,buf))
2847 return wxCONV_FAILED;
2848 }
2849 return inbuf;
2850 }
2851
2852 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2853 {
2854 const size_t inbuf = wxWcslen(psz);
2855 if (buf)
2856 {
2857 if (!w2m.Convert(psz,buf))
2858 return wxCONV_FAILED;
2859 }
2860
2861 return inbuf;
2862 }
2863
2864 virtual size_t GetMBNulLen() const
2865 {
2866 switch ( m_enc )
2867 {
2868 case wxFONTENCODING_UTF16BE:
2869 case wxFONTENCODING_UTF16LE:
2870 return 2;
2871
2872 case wxFONTENCODING_UTF32BE:
2873 case wxFONTENCODING_UTF32LE:
2874 return 4;
2875
2876 default:
2877 return 1;
2878 }
2879 }
2880
2881 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2882
2883 bool IsOk() const { return m_ok; }
2884
2885 public:
2886 wxFontEncoding m_enc;
2887 wxEncodingConverter m2w, w2m;
2888
2889 private:
2890 // were we initialized successfully?
2891 bool m_ok;
2892
2893 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2894 };
2895
2896 // make the constructors available for unit testing
2897 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2898 {
2899 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2900 if ( !result->IsOk() )
2901 {
2902 delete result;
2903 return 0;
2904 }
2905 return result;
2906 }
2907
2908 #endif // wxUSE_FONTMAP
2909
2910 // ============================================================================
2911 // wxCSConv implementation
2912 // ============================================================================
2913
2914 void wxCSConv::Init()
2915 {
2916 m_name = NULL;
2917 m_convReal = NULL;
2918 m_deferred = true;
2919 }
2920
2921 wxCSConv::wxCSConv(const wxChar *charset)
2922 {
2923 Init();
2924
2925 if ( charset )
2926 {
2927 SetName(charset);
2928 }
2929
2930 #if wxUSE_FONTMAP
2931 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2932 #else
2933 m_encoding = wxFONTENCODING_SYSTEM;
2934 #endif
2935 }
2936
2937 wxCSConv::wxCSConv(wxFontEncoding encoding)
2938 {
2939 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2940 {
2941 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2942
2943 encoding = wxFONTENCODING_SYSTEM;
2944 }
2945
2946 Init();
2947
2948 m_encoding = encoding;
2949 }
2950
2951 wxCSConv::~wxCSConv()
2952 {
2953 Clear();
2954 }
2955
2956 wxCSConv::wxCSConv(const wxCSConv& conv)
2957 : wxMBConv()
2958 {
2959 Init();
2960
2961 SetName(conv.m_name);
2962 m_encoding = conv.m_encoding;
2963 }
2964
2965 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2966 {
2967 Clear();
2968
2969 SetName(conv.m_name);
2970 m_encoding = conv.m_encoding;
2971
2972 return *this;
2973 }
2974
2975 void wxCSConv::Clear()
2976 {
2977 free(m_name);
2978 delete m_convReal;
2979
2980 m_name = NULL;
2981 m_convReal = NULL;
2982 }
2983
2984 void wxCSConv::SetName(const wxChar *charset)
2985 {
2986 if (charset)
2987 {
2988 m_name = wxStrdup(charset);
2989 m_deferred = true;
2990 }
2991 }
2992
2993 #if wxUSE_FONTMAP
2994 #include "wx/hashmap.h"
2995
2996 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2997 wxEncodingNameCache );
2998
2999 static wxEncodingNameCache gs_nameCache;
3000 #endif
3001
3002 wxMBConv *wxCSConv::DoCreate() const
3003 {
3004 #if wxUSE_FONTMAP
3005 wxLogTrace(TRACE_STRCONV,
3006 wxT("creating conversion for %s"),
3007 (m_name ? m_name
3008 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3009 #endif // wxUSE_FONTMAP
3010
3011 // check for the special case of ASCII or ISO8859-1 charset: as we have
3012 // special knowledge of it anyhow, we don't need to create a special
3013 // conversion object
3014 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3015 m_encoding == wxFONTENCODING_DEFAULT )
3016 {
3017 // don't convert at all
3018 return NULL;
3019 }
3020
3021 // we trust OS to do conversion better than we can so try external
3022 // conversion methods first
3023 //
3024 // the full order is:
3025 // 1. OS conversion (iconv() under Unix or Win32 API)
3026 // 2. hard coded conversions for UTF
3027 // 3. wxEncodingConverter as fall back
3028
3029 // step (1)
3030 #ifdef HAVE_ICONV
3031 #if !wxUSE_FONTMAP
3032 if ( m_name )
3033 #endif // !wxUSE_FONTMAP
3034 {
3035 wxString name(m_name);
3036 wxFontEncoding encoding(m_encoding);
3037
3038 if ( !name.empty() )
3039 {
3040 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3041 if ( conv->IsOk() )
3042 return conv;
3043
3044 delete conv;
3045
3046 #if wxUSE_FONTMAP
3047 encoding =
3048 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3049 #endif // wxUSE_FONTMAP
3050 }
3051 #if wxUSE_FONTMAP
3052 {
3053 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3054 if ( it != gs_nameCache.end() )
3055 {
3056 if ( it->second.empty() )
3057 return NULL;
3058
3059 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3060 if ( conv->IsOk() )
3061 return conv;
3062
3063 delete conv;
3064 }
3065
3066 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3067
3068 for ( ; *names; ++names )
3069 {
3070 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3071 if ( conv->IsOk() )
3072 {
3073 gs_nameCache[encoding] = *names;
3074 return conv;
3075 }
3076
3077 delete conv;
3078 }
3079
3080 gs_nameCache[encoding] = _T(""); // cache the failure
3081 }
3082 #endif // wxUSE_FONTMAP
3083 }
3084 #endif // HAVE_ICONV
3085
3086 #ifdef wxHAVE_WIN32_MB2WC
3087 {
3088 #if wxUSE_FONTMAP
3089 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3090 : new wxMBConv_win32(m_encoding);
3091 if ( conv->IsOk() )
3092 return conv;
3093
3094 delete conv;
3095 #else
3096 return NULL;
3097 #endif
3098 }
3099 #endif // wxHAVE_WIN32_MB2WC
3100 #if defined(__WXMAC__)
3101 {
3102 // leave UTF16 and UTF32 to the built-ins of wx
3103 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3104 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3105 {
3106
3107 #if wxUSE_FONTMAP
3108 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3109 : new wxMBConv_mac(m_encoding);
3110 #else
3111 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3112 #endif
3113 if ( conv->IsOk() )
3114 return conv;
3115
3116 delete conv;
3117 }
3118 }
3119 #endif
3120 #if defined(__WXCOCOA__)
3121 {
3122 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3123 {
3124
3125 #if wxUSE_FONTMAP
3126 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3127 : new wxMBConv_cocoa(m_encoding);
3128 #else
3129 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3130 #endif
3131 if ( conv->IsOk() )
3132 return conv;
3133
3134 delete conv;
3135 }
3136 }
3137 #endif
3138 // step (2)
3139 wxFontEncoding enc = m_encoding;
3140 #if wxUSE_FONTMAP
3141 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3142 {
3143 // use "false" to suppress interactive dialogs -- we can be called from
3144 // anywhere and popping up a dialog from here is the last thing we want to
3145 // do
3146 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3147 }
3148 #endif // wxUSE_FONTMAP
3149
3150 switch ( enc )
3151 {
3152 case wxFONTENCODING_UTF7:
3153 return new wxMBConvUTF7;
3154
3155 case wxFONTENCODING_UTF8:
3156 return new wxMBConvUTF8;
3157
3158 case wxFONTENCODING_UTF16BE:
3159 return new wxMBConvUTF16BE;
3160
3161 case wxFONTENCODING_UTF16LE:
3162 return new wxMBConvUTF16LE;
3163
3164 case wxFONTENCODING_UTF32BE:
3165 return new wxMBConvUTF32BE;
3166
3167 case wxFONTENCODING_UTF32LE:
3168 return new wxMBConvUTF32LE;
3169
3170 default:
3171 // nothing to do but put here to suppress gcc warnings
3172 ;
3173 }
3174
3175 // step (3)
3176 #if wxUSE_FONTMAP
3177 {
3178 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3179 : new wxMBConv_wxwin(m_encoding);
3180 if ( conv->IsOk() )
3181 return conv;
3182
3183 delete conv;
3184 }
3185 #endif // wxUSE_FONTMAP
3186
3187 // NB: This is a hack to prevent deadlock. What could otherwise happen
3188 // in Unicode build: wxConvLocal creation ends up being here
3189 // because of some failure and logs the error. But wxLog will try to
3190 // attach timestamp, for which it will need wxConvLocal (to convert
3191 // time to char* and then wchar_t*), but that fails, tries to log
3192 // error, but wxLog has a (already locked) critical section that
3193 // guards static buffer.
3194 static bool alreadyLoggingError = false;
3195 if (!alreadyLoggingError)
3196 {
3197 alreadyLoggingError = true;
3198 wxLogError(_("Cannot convert from the charset '%s'!"),
3199 m_name ? m_name
3200 :
3201 #if wxUSE_FONTMAP
3202 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3203 #else // !wxUSE_FONTMAP
3204 wxString::Format(_("encoding %s"), m_encoding).c_str()
3205 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3206 );
3207 alreadyLoggingError = false;
3208 }
3209
3210 return NULL;
3211 }
3212
3213 void wxCSConv::CreateConvIfNeeded() const
3214 {
3215 if ( m_deferred )
3216 {
3217 wxCSConv *self = (wxCSConv *)this; // const_cast
3218
3219 #if wxUSE_INTL
3220 // if we don't have neither the name nor the encoding, use the default
3221 // encoding for this system
3222 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3223 {
3224 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3225 }
3226 #endif // wxUSE_INTL
3227
3228 self->m_convReal = DoCreate();
3229 self->m_deferred = false;
3230 }
3231 }
3232
3233 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3234 {
3235 CreateConvIfNeeded();
3236
3237 if (m_convReal)
3238 return m_convReal->MB2WC(buf, psz, n);
3239
3240 // latin-1 (direct)
3241 size_t len = strlen(psz);
3242
3243 if (buf)
3244 {
3245 for (size_t c = 0; c <= len; c++)
3246 buf[c] = (unsigned char)(psz[c]);
3247 }
3248
3249 return len;
3250 }
3251
3252 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3253 {
3254 CreateConvIfNeeded();
3255
3256 if (m_convReal)
3257 return m_convReal->WC2MB(buf, psz, n);
3258
3259 // latin-1 (direct)
3260 const size_t len = wxWcslen(psz);
3261 if (buf)
3262 {
3263 for (size_t c = 0; c <= len; c++)
3264 {
3265 if (psz[c] > 0xFF)
3266 return wxCONV_FAILED;
3267 buf[c] = (char)psz[c];
3268 }
3269 }
3270 else
3271 {
3272 for (size_t c = 0; c <= len; c++)
3273 {
3274 if (psz[c] > 0xFF)
3275 return wxCONV_FAILED;
3276 }
3277 }
3278
3279 return len;
3280 }
3281
3282 size_t wxCSConv::GetMBNulLen() const
3283 {
3284 CreateConvIfNeeded();
3285
3286 if ( m_convReal )
3287 {
3288 return m_convReal->GetMBNulLen();
3289 }
3290
3291 return 1;
3292 }
3293
3294 // ----------------------------------------------------------------------------
3295 // globals
3296 // ----------------------------------------------------------------------------
3297
3298 #ifdef __WINDOWS__
3299 static wxMBConv_win32 wxConvLibcObj;
3300 #elif defined(__WXMAC__) && !defined(__MACH__)
3301 static wxMBConv_mac wxConvLibcObj ;
3302 #else
3303 static wxMBConvLibc wxConvLibcObj;
3304 #endif
3305
3306 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3307 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3308 static wxMBConvUTF7 wxConvUTF7Obj;
3309 static wxMBConvUTF8 wxConvUTF8Obj;
3310
3311 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3312 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3313 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3314 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3315 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3316 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3317 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3318 #ifdef __WXOSX__
3319 wxConvUTF8Obj;
3320 #else
3321 wxConvLibcObj;
3322 #endif
3323
3324
3325 #else // !wxUSE_WCHAR_T
3326
3327 // stand-ins in absence of wchar_t
3328 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3329 wxConvISO8859_1,
3330 wxConvLocal,
3331 wxConvUTF8;
3332
3333 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T