]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
f4636aaefb3bab5f27e9fd481c9c55e1e628d02c
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 // For compilers that support precompilation, includes "wx.h".
24 #include "wx/wxprec.h"
25
26 #ifdef __BORLANDC__
27 #pragma hdrstop
28 #endif
29
30 #ifndef WX_PRECOMP
31 #include "wx/intl.h"
32 #include "wx/log.h"
33 #endif // WX_PRECOMP
34
35 #include "wx/strconv.h"
36
37 #if wxUSE_WCHAR_T
38
39 #ifdef __WINDOWS__
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #endif
43
44 #ifndef __WXWINCE__
45 #include <errno.h>
46 #endif
47
48 #include <ctype.h>
49 #include <string.h>
50 #include <stdlib.h>
51
52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
53 #define wxHAVE_WIN32_MB2WC
54 #endif // __WIN32__ but !__WXMICROWIN__
55
56 #ifdef __SALFORDC__
57 #include <clib.h>
58 #endif
59
60 #ifdef HAVE_ICONV
61 #include <iconv.h>
62 #include "wx/thread.h"
63 #endif
64
65 #include "wx/encconv.h"
66 #include "wx/fontmap.h"
67 #include "wx/utils.h"
68
69 #ifdef __WXMAC__
70 #ifndef __DARWIN__
71 #include <ATSUnicode.h>
72 #include <TextCommon.h>
73 #include <TextEncodingConverter.h>
74 #endif
75
76 #include "wx/mac/private.h" // includes mac headers
77 #endif
78
79 #define TRACE_STRCONV _T("strconv")
80
81 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
82 // be 4 bytes
83 #if SIZEOF_WCHAR_T == 2
84 #define WC_UTF16
85 #endif
86
87 // ============================================================================
88 // implementation
89 // ============================================================================
90
91 // helper function of cMB2WC(): check if n bytes at this location are all NUL
// Returns true if at least one of the n bytes starting at p is not NUL and
// false if all of them are NUL (which is trivially the case when n is 0).
//
// Bytes following the first non-NUL one are not examined.
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
99
100 // ----------------------------------------------------------------------------
101 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
102 // ----------------------------------------------------------------------------
103
104 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
105 {
106 if (input<=0xffff)
107 {
108 if (output)
109 *output = (wxUint16) input;
110 return 1;
111 }
112 else if (input>=0x110000)
113 {
114 return wxCONV_FAILED;
115 }
116 else
117 {
118 if (output)
119 {
120 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
121 *output = (wxUint16) ((input&0x3ff)+0xdc00);
122 }
123 return 2;
124 }
125 }
126
127 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
128 {
129 if ((*input<0xd800) || (*input>0xdfff))
130 {
131 output = *input;
132 return 1;
133 }
134 else if ((input[1]<0xdc00) || (input[1]>0xdfff))
135 {
136 output = *input;
137 return wxCONV_FAILED;
138 }
139 else
140 {
141 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
142 return 2;
143 }
144 }
145
146 #ifdef WC_UTF16
147 typedef wchar_t wxDecodeSurrogate_t;
148 #else // !WC_UTF16
149 typedef wxUint16 wxDecodeSurrogate_t;
150 #endif // WC_UTF16/!WC_UTF16
151
152 // returns the next UTF-32 character from the wchar_t buffer and advances the
153 // pointer to the character after this one
154 //
155 // if an invalid character is found, *pSrc is set to NULL, the caller must
156 // check for this
157 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
158 {
159 wxUint32 out;
160 const size_t n = decode_utf16(*pSrc, out);
161 if ( n == wxCONV_FAILED )
162 *pSrc = NULL;
163 else
164 *pSrc += n;
165
166 return out;
167 }
168
169 // ----------------------------------------------------------------------------
170 // wxMBConv
171 // ----------------------------------------------------------------------------
172
173 size_t
174 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
175 const char *src, size_t srcLen) const
176 {
177 // although new conversion classes are supposed to implement this function
178 // directly, the existins ones only implement the old MB2WC() and so, to
179 // avoid to have to rewrite all conversion classes at once, we provide a
180 // default (but not efficient) implementation of this one in terms of the
181 // old function by copying the input to ensure that it's NUL-terminated and
182 // then using MB2WC() to convert it
183
184 // the number of chars [which would be] written to dst [if it were not NULL]
185 size_t dstWritten = 0;
186
187 // the number of NULs terminating this string
188 size_t nulLen wxDUMMY_INITIALIZE(0);
189
190 // if we were not given the input size we just have to assume that the
191 // string is properly terminated as we have no way of knowing how long it
192 // is anyhow, but if we do have the size check whether there are enough
193 // NULs at the end
194 wxCharBuffer bufTmp;
195 const char *srcEnd;
196 if ( srcLen != wxNO_LEN )
197 {
198 // we need to know how to find the end of this string
199 nulLen = GetMBNulLen();
200 if ( nulLen == wxCONV_FAILED )
201 return wxCONV_FAILED;
202
203 // if there are enough NULs we can avoid the copy
204 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
205 {
206 // make a copy in order to properly NUL-terminate the string
207 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
208 char * const p = bufTmp.data();
209 memcpy(p, src, srcLen);
210 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
211 *s = '\0';
212
213 src = bufTmp;
214 }
215
216 srcEnd = src + srcLen;
217 }
218 else // quit after the first loop iteration
219 {
220 srcEnd = NULL;
221 }
222
223 for ( ;; )
224 {
225 // try to convert the current chunk
226 size_t lenChunk = MB2WC(NULL, src, 0);
227 if ( lenChunk == wxCONV_FAILED )
228 return wxCONV_FAILED;
229
230 lenChunk++; // for the L'\0' at the end of this chunk
231
232 dstWritten += lenChunk;
233
234 if ( lenChunk == 1 )
235 {
236 // nothing left in the input string, conversion succeeded
237 break;
238 }
239
240 if ( dst )
241 {
242 if ( dstWritten > dstLen )
243 return wxCONV_FAILED;
244
245 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
246 return wxCONV_FAILED;
247
248 dst += lenChunk;
249 }
250
251 if ( !srcEnd )
252 {
253 // we convert just one chunk in this case as this is the entire
254 // string anyhow
255 break;
256 }
257
258 // advance the input pointer past the end of this chunk
259 while ( NotAllNULs(src, nulLen) )
260 {
261 // notice that we must skip over multiple bytes here as we suppose
262 // that if NUL takes 2 or 4 bytes, then all the other characters do
263 // too and so if advanced by a single byte we might erroneously
264 // detect sequences of NUL bytes in the middle of the input
265 src += nulLen;
266 }
267
268 src += nulLen; // skipping over its terminator as well
269
270 // note that ">=" (and not just "==") is needed here as the terminator
271 // we skipped just above could be inside or just after the buffer
272 // delimited by inEnd
273 if ( src >= srcEnd )
274 break;
275 }
276
277 return dstWritten;
278 }
279
280 size_t
281 wxMBConv::FromWChar(char *dst, size_t dstLen,
282 const wchar_t *src, size_t srcLen) const
283 {
284 // the number of chars [which would be] written to dst [if it were not NULL]
285 size_t dstWritten = 0;
286
287 // make a copy of the input string unless it is already properly
288 // NUL-terminated
289 //
290 // if we don't know its length we have no choice but to assume that it is,
291 // indeed, properly terminated
292 wxWCharBuffer bufTmp;
293 if ( srcLen == wxNO_LEN )
294 {
295 srcLen = wxWcslen(src) + 1;
296 }
297 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
298 {
299 // make a copy in order to properly NUL-terminate the string
300 bufTmp = wxWCharBuffer(srcLen);
301 memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
302 src = bufTmp;
303 }
304
305 const size_t lenNul = GetMBNulLen();
306 for ( const wchar_t * const srcEnd = src + srcLen;
307 src < srcEnd;
308 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
309 {
310 // try to convert the current chunk
311 size_t lenChunk = WC2MB(NULL, src, 0);
312
313 if ( lenChunk == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 lenChunk += lenNul;
317 dstWritten += lenChunk;
318
319 if ( dst )
320 {
321 if ( dstWritten > dstLen )
322 return wxCONV_FAILED;
323
324 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
325 return wxCONV_FAILED;
326
327 dst += lenChunk;
328 }
329 }
330
331 return dstWritten;
332 }
333
334 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
335 {
336 size_t rc = ToWChar(out, outLen, in);
337 if ( rc != wxCONV_FAILED )
338 {
339 // ToWChar() returns the buffer length, i.e. including the trailing
340 // NUL, while this method doesn't take it into account
341 rc--;
342 }
343
344 return rc;
345 }
346
347 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
348 {
349 size_t rc = FromWChar(out, outLen, in);
350 if ( rc != wxCONV_FAILED )
351 {
352 rc -= GetMBNulLen();
353 }
354
355 return rc;
356 }
357
358 wxMBConv::~wxMBConv()
359 {
360 // nothing to do here (necessary for Darwin linking probably)
361 }
362
363 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
364 {
365 if ( psz )
366 {
367 // calculate the length of the buffer needed first
368 const size_t nLen = MB2WC(NULL, psz, 0);
369 if ( nLen != wxCONV_FAILED )
370 {
371 // now do the actual conversion
372 wxWCharBuffer buf(nLen /* +1 added implicitly */);
373
374 // +1 for the trailing NULL
375 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
376 return buf;
377 }
378 }
379
380 return wxWCharBuffer();
381 }
382
383 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
384 {
385 if ( pwz )
386 {
387 const size_t nLen = WC2MB(NULL, pwz, 0);
388 if ( nLen != wxCONV_FAILED )
389 {
390 // extra space for trailing NUL(s)
391 static const size_t extraLen = GetMaxMBNulLen();
392
393 wxCharBuffer buf(nLen + extraLen - 1);
394 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
395 return buf;
396 }
397 }
398
399 return wxCharBuffer();
400 }
401
402 const wxWCharBuffer
403 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
404 {
405 const size_t dstLen = ToWChar(NULL, 0, in, inLen);
406 if ( dstLen != wxCONV_FAILED )
407 {
408 wxWCharBuffer wbuf(dstLen - 1);
409 if ( ToWChar(wbuf.data(), dstLen, in, inLen) != wxCONV_FAILED )
410 {
411 if ( outLen )
412 {
413 *outLen = dstLen;
414 if ( wbuf[dstLen - 1] == L'\0' )
415 (*outLen)--;
416 }
417
418 return wbuf;
419 }
420 }
421
422 if ( outLen )
423 *outLen = 0;
424
425 return wxWCharBuffer();
426 }
427
428 const wxCharBuffer
429 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
430 {
431 const size_t dstLen = FromWChar(NULL, 0, in, inLen);
432 if ( dstLen != wxCONV_FAILED )
433 {
434 wxCharBuffer buf(dstLen - 1);
435 if ( FromWChar(buf.data(), dstLen, in, inLen) != wxCONV_FAILED )
436 {
437 if ( outLen )
438 {
439 *outLen = dstLen;
440
441 const size_t nulLen = GetMBNulLen();
442 if ( !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
443 {
444 // in this case the output is NUL-terminated and we're not
445 // supposed to count NUL
446 (*outLen) -= nulLen;
447 }
448 }
449
450 return buf;
451 }
452 }
453
454 if ( outLen )
455 *outLen = 0;
456
457 return wxCharBuffer();
458 }
459
460 // ----------------------------------------------------------------------------
461 // wxMBConvLibc
462 // ----------------------------------------------------------------------------
463
464 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
465 {
466 return wxMB2WC(buf, psz, n);
467 }
468
469 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
470 {
471 return wxWC2MB(buf, psz, n);
472 }
473
474 // ----------------------------------------------------------------------------
475 // wxConvBrokenFileNames
476 // ----------------------------------------------------------------------------
477
478 #ifdef __UNIX__
479
480 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
481 {
482 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
483 || wxStricmp(charset, _T("UTF8")) == 0 )
484 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
485 else
486 m_conv = new wxCSConv(charset);
487 }
488
489 #endif // __UNIX__
490
491 // ----------------------------------------------------------------------------
492 // UTF-7
493 // ----------------------------------------------------------------------------
494
495 // Implementation (C) 2004 Fredrik Roubert
496
497 //
498 // BASE64 decoding table
499 //
// Maps each byte value to the 6 bit value it represents in BASE64 or to 0xff
// if the byte is not a BASE64 character. Each row covers 16 consecutive byte
// values starting with the one given in the comment at its end.
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x00
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x10
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // 0x20: '+', '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x30: '0'-'9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 0x40: 'A'-'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x50: 'P'-'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 0x60: 'a'-'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x70: 'p'-'z'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x80
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0x90
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xa0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xb0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xc0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xd0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // 0xe0
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff  // 0xf0
};
535
536 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
537 {
538 size_t len = 0;
539
540 while ( *psz && (!buf || (len < n)) )
541 {
542 unsigned char cc = *psz++;
543 if (cc != '+')
544 {
545 // plain ASCII char
546 if (buf)
547 *buf++ = cc;
548 len++;
549 }
550 else if (*psz == '-')
551 {
552 // encoded plus sign
553 if (buf)
554 *buf++ = cc;
555 len++;
556 psz++;
557 }
558 else // start of BASE64 encoded string
559 {
560 bool lsb, ok;
561 unsigned int d, l;
562 for ( ok = lsb = false, d = 0, l = 0;
563 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
564 psz++ )
565 {
566 d <<= 6;
567 d += cc;
568 for (l += 6; l >= 8; lsb = !lsb)
569 {
570 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
571 if (lsb)
572 {
573 if (buf)
574 *buf++ |= c;
575 len ++;
576 }
577 else
578 {
579 if (buf)
580 *buf = (wchar_t)(c << 8);
581 }
582
583 ok = true;
584 }
585 }
586
587 if ( !ok )
588 {
589 // in valid UTF7 we should have valid characters after '+'
590 return wxCONV_FAILED;
591 }
592
593 if (*psz == '-')
594 psz++;
595 }
596 }
597
598 if ( buf && (len < n) )
599 *buf = '\0';
600
601 return len;
602 }
603
604 //
605 // BASE64 encoding table
606 //
// Maps a 6 bit value (0..63) to the BASE64 character representing it. Each
// row contains 16 entries, the value of the first one is given in the comment
// at its end.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', //  0
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', // 16
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', // 32
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'  // 48
};
618
619 //
620 // UTF-7 encoding table
621 //
622 // 0 - Set D (directly encoded characters)
623 // 1 - Set O (optional direct characters)
624 // 2 - whitespace characters (optional)
625 // 3 - special characters
626 //
// Class of each of the 128 ASCII characters, using the values 0 to 3
// described above. Each row covers 16 consecutive characters, the comment at
// its end shows the code of the first one.
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,     // 0x00 (TAB, LF, CR)
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // 0x10
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,     // 0x20 (' ' to '/')
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,     // 0x30 ('0' to '?')
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x40 ('@' to 'O')
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,     // 0x50 ('P' to '_')
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 0x60 ('`' to 'o')
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3      // 0x70 ('p' to DEL)
};
638
639 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
640 {
641 size_t len = 0;
642
643 while (*psz && ((!buf) || (len < n)))
644 {
645 wchar_t cc = *psz++;
646 if (cc < 0x80 && utf7encode[cc] < 1)
647 {
648 // plain ASCII char
649 if (buf)
650 *buf++ = (char)cc;
651 len++;
652 }
653 #ifndef WC_UTF16
654 else if (((wxUint32)cc) > 0xffff)
655 {
656 // no surrogate pair generation (yet?)
657 return wxCONV_FAILED;
658 }
659 #endif
660 else
661 {
662 if (buf)
663 *buf++ = '+';
664 len++;
665 if (cc != '+')
666 {
667 // BASE64 encode string
668 unsigned int lsb, d, l;
669 for (d = 0, l = 0; /*nothing*/; psz++)
670 {
671 for (lsb = 0; lsb < 2; lsb ++)
672 {
673 d <<= 8;
674 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
675
676 for (l += 8; l >= 6; )
677 {
678 l -= 6;
679 if (buf)
680 *buf++ = utf7enb64[(d >> l) % 64];
681 len++;
682 }
683 }
684 cc = *psz;
685 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
686 break;
687 }
688 if (l != 0)
689 {
690 if (buf)
691 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
692 len++;
693 }
694 }
695 if (buf)
696 *buf++ = '-';
697 len++;
698 }
699 }
700 if (buf && (len < n))
701 *buf = 0;
702 return len;
703 }
704
705 // ----------------------------------------------------------------------------
706 // UTF-8
707 // ----------------------------------------------------------------------------
708
709 static wxUint32 utf8_max[]=
710 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
711
712 // boundaries of the private use area we use to (temporarily) remap invalid
713 // characters invalid in a UTF-8 encoded string
714 const wxUint32 wxUnicodePUA = 0x100000;
715 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
716
717 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
718 {
719 size_t len = 0;
720
721 while (*psz && ((!buf) || (len < n)))
722 {
723 const char *opsz = psz;
724 bool invalid = false;
725 unsigned char cc = *psz++, fc = cc;
726 unsigned cnt;
727 for (cnt = 0; fc & 0x80; cnt++)
728 fc <<= 1;
729 if (!cnt)
730 {
731 // plain ASCII char
732 if (buf)
733 *buf++ = cc;
734 len++;
735
736 // escape the escape character for octal escapes
737 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
738 && cc == '\\' && (!buf || len < n))
739 {
740 if (buf)
741 *buf++ = cc;
742 len++;
743 }
744 }
745 else
746 {
747 cnt--;
748 if (!cnt)
749 {
750 // invalid UTF-8 sequence
751 invalid = true;
752 }
753 else
754 {
755 unsigned ocnt = cnt - 1;
756 wxUint32 res = cc & (0x3f >> cnt);
757 while (cnt--)
758 {
759 cc = *psz;
760 if ((cc & 0xC0) != 0x80)
761 {
762 // invalid UTF-8 sequence
763 invalid = true;
764 break;
765 }
766 psz++;
767 res = (res << 6) | (cc & 0x3f);
768 }
769 if (invalid || res <= utf8_max[ocnt])
770 {
771 // illegal UTF-8 encoding
772 invalid = true;
773 }
774 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
775 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
776 {
777 // if one of our PUA characters turns up externally
778 // it must also be treated as an illegal sequence
779 // (a bit like you have to escape an escape character)
780 invalid = true;
781 }
782 else
783 {
784 #ifdef WC_UTF16
785 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
786 size_t pa = encode_utf16(res, (wxUint16 *)buf);
787 if (pa == wxCONV_FAILED)
788 {
789 invalid = true;
790 }
791 else
792 {
793 if (buf)
794 buf += pa;
795 len += pa;
796 }
797 #else // !WC_UTF16
798 if (buf)
799 *buf++ = (wchar_t)res;
800 len++;
801 #endif // WC_UTF16/!WC_UTF16
802 }
803 }
804 if (invalid)
805 {
806 if (m_options & MAP_INVALID_UTF8_TO_PUA)
807 {
808 while (opsz < psz && (!buf || len < n))
809 {
810 #ifdef WC_UTF16
811 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
812 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
813 wxASSERT(pa != wxCONV_FAILED);
814 if (buf)
815 buf += pa;
816 opsz++;
817 len += pa;
818 #else
819 if (buf)
820 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
821 opsz++;
822 len++;
823 #endif
824 }
825 }
826 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
827 {
828 while (opsz < psz && (!buf || len < n))
829 {
830 if ( buf && len + 3 < n )
831 {
832 unsigned char on = *opsz;
833 *buf++ = L'\\';
834 *buf++ = (wchar_t)( L'0' + on / 0100 );
835 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
836 *buf++ = (wchar_t)( L'0' + on % 010 );
837 }
838 opsz++;
839 len += 4;
840 }
841 }
842 else // MAP_INVALID_UTF8_NOT
843 {
844 return wxCONV_FAILED;
845 }
846 }
847 }
848 }
849 if (buf && (len < n))
850 *buf = 0;
851 return len;
852 }
853
// Returns true if wch is one of the octal digits, i.e. L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
858
859 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
860 {
861 size_t len = 0;
862
863 while (*psz && ((!buf) || (len < n)))
864 {
865 wxUint32 cc;
866 #ifdef WC_UTF16
867 // cast is ok for WC_UTF16
868 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
869 psz += (pa == wxCONV_FAILED) ? 1 : pa;
870 #else
871 cc=(*psz++) & 0x7fffffff;
872 #endif
873
874 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
875 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
876 {
877 if (buf)
878 *buf++ = (char)(cc - wxUnicodePUA);
879 len++;
880 }
881 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
882 && cc == L'\\' && psz[0] == L'\\' )
883 {
884 if (buf)
885 *buf++ = (char)cc;
886 psz++;
887 len++;
888 }
889 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
890 cc == L'\\' &&
891 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
892 {
893 if (buf)
894 {
895 *buf++ = (char) ((psz[0] - L'0')*0100 +
896 (psz[1] - L'0')*010 +
897 (psz[2] - L'0'));
898 }
899
900 psz += 3;
901 len++;
902 }
903 else
904 {
905 unsigned cnt;
906 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
907 if (!cnt)
908 {
909 // plain ASCII char
910 if (buf)
911 *buf++ = (char) cc;
912 len++;
913 }
914
915 else
916 {
917 len += cnt + 1;
918 if (buf)
919 {
920 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
921 while (cnt--)
922 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
923 }
924 }
925 }
926 }
927
928 if (buf && (len<n))
929 *buf = 0;
930
931 return len;
932 }
933
934 // ============================================================================
935 // UTF-16
936 // ============================================================================
937
938 #ifdef WORDS_BIGENDIAN
939 #define wxMBConvUTF16straight wxMBConvUTF16BE
940 #define wxMBConvUTF16swap wxMBConvUTF16LE
941 #else
942 #define wxMBConvUTF16swap wxMBConvUTF16BE
943 #define wxMBConvUTF16straight wxMBConvUTF16LE
944 #endif
945
946 /* static */
947 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
948 {
949 if ( srcLen == wxNO_LEN )
950 {
951 // count the number of bytes in input, including the trailing NULs
952 const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
953 for ( srcLen = 1; *in++; srcLen++ )
954 ;
955
956 srcLen *= BYTES_PER_CHAR;
957 }
958 else // we already have the length
959 {
960 // we can only convert an entire number of UTF-16 characters
961 if ( srcLen % BYTES_PER_CHAR )
962 return wxCONV_FAILED;
963 }
964
965 return srcLen;
966 }
967
968 // case when in-memory representation is UTF-16 too
969 #ifdef WC_UTF16
970
971 // ----------------------------------------------------------------------------
972 // conversions without endianness change
973 // ----------------------------------------------------------------------------
974
975 size_t
976 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
977 const char *src, size_t srcLen) const
978 {
979 // set up the scene for using memcpy() (which is presumably more efficient
980 // than copying the bytes one by one)
981 srcLen = GetLength(src, srcLen);
982 if ( srcLen == wxNO_LEN )
983 return wxCONV_FAILED;
984
985 const size_t inLen = srcLen/BYTES_PER_CHAR;
986 if ( dst )
987 {
988 if ( dstLen < inLen )
989 return wxCONV_FAILED;
990
991 memcpy(dst, src, srcLen);
992 }
993
994 return inLen;
995 }
996
997 size_t
998 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
999 const wchar_t *src, size_t srcLen) const
1000 {
1001 if ( srcLen == wxNO_LEN )
1002 srcLen = wxWcslen(src) + 1;
1003
1004 srcLen *= BYTES_PER_CHAR;
1005
1006 if ( dst )
1007 {
1008 if ( dstLen < srcLen )
1009 return wxCONV_FAILED;
1010
1011 memcpy(dst, src, srcLen);
1012 }
1013
1014 return srcLen;
1015 }
1016
1017 // ----------------------------------------------------------------------------
1018 // endian-reversing conversions
1019 // ----------------------------------------------------------------------------
1020
1021 size_t
1022 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1023 const char *src, size_t srcLen) const
1024 {
1025 srcLen = GetLength(src, srcLen);
1026 if ( srcLen == wxNO_LEN )
1027 return wxCONV_FAILED;
1028
1029 srcLen /= BYTES_PER_CHAR;
1030
1031 if ( dst )
1032 {
1033 if ( dstLen < srcLen )
1034 return wxCONV_FAILED;
1035
1036 const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
1037 for ( size_t n = 0; n < srcLen; n++, in++ )
1038 {
1039 *dst++ = wxUINT16_SWAP_ALWAYS(*in);
1040 }
1041 }
1042
1043 return srcLen;
1044 }
1045
1046 size_t
1047 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1048 const wchar_t *src, size_t srcLen) const
1049 {
1050 if ( srcLen == wxNO_LEN )
1051 srcLen = wxWcslen(src) + 1;
1052
1053 srcLen *= BYTES_PER_CHAR;
1054
1055 if ( dst )
1056 {
1057 if ( dstLen < srcLen )
1058 return wxCONV_FAILED;
1059
1060 wxUint16 *out = wx_reinterpret_cast(wxUint16 *, dst);
1061 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1062 {
1063 *out++ = wxUINT16_SWAP_ALWAYS(*src);
1064 }
1065 }
1066
1067 return srcLen;
1068 }
1069
1070 #else // !WC_UTF16: wchar_t is UTF-32
1071
1072 // ----------------------------------------------------------------------------
1073 // conversions without endianness change
1074 // ----------------------------------------------------------------------------
1075
1076 size_t
1077 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1078 const char *src, size_t srcLen) const
1079 {
1080 srcLen = GetLength(src, srcLen);
1081 if ( srcLen == wxNO_LEN )
1082 return wxCONV_FAILED;
1083
1084 const size_t inLen = srcLen/BYTES_PER_CHAR;
1085 if ( !dst )
1086 {
1087 // optimization: return maximal space which could be needed for this
1088 // string even if the real size could be smaller if the buffer contains
1089 // any surrogates
1090 return inLen;
1091 }
1092
1093 size_t outLen = 0;
1094 const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
1095 for ( const wxUint16 * const inEnd = in + inLen; in < inEnd; )
1096 {
1097 const wxUint32 ch = wxDecodeSurrogate(&in);
1098 if ( !in )
1099 return wxCONV_FAILED;
1100
1101 if ( ++outLen > dstLen )
1102 return wxCONV_FAILED;
1103
1104 *dst++ = ch;
1105 }
1106
1107
1108 return outLen;
1109 }
1110
1111 size_t
1112 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1113 const wchar_t *src, size_t srcLen) const
1114 {
1115 if ( srcLen == wxNO_LEN )
1116 srcLen = wxWcslen(src) + 1;
1117
1118 size_t outLen = 0;
1119 wxUint16 *out = wx_reinterpret_cast(wxUint16 *, dst);
1120 for ( size_t n = 0; n < srcLen; n++ )
1121 {
1122 wxUint16 cc[2];
1123 const size_t numChars = encode_utf16(*src++, cc);
1124 if ( numChars == wxCONV_FAILED )
1125 return wxCONV_FAILED;
1126
1127 outLen += numChars*BYTES_PER_CHAR;
1128 if ( out )
1129 {
1130 if ( outLen > dstLen )
1131 return wxCONV_FAILED;
1132
1133 *out++ = cc[0];
1134 if ( numChars == 2 )
1135 {
1136 // second character of a surrogate
1137 *out++ = cc[1];
1138 }
1139 }
1140 }
1141
1142 return outLen;
1143 }
1144
1145 // ----------------------------------------------------------------------------
1146 // endian-reversing conversions
1147 // ----------------------------------------------------------------------------
1148
1149 size_t
1150 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1151 const char *src, size_t srcLen) const
1152 {
1153 srcLen = GetLength(src, srcLen);
1154 if ( srcLen == wxNO_LEN )
1155 return wxCONV_FAILED;
1156
1157 const size_t inLen = srcLen/BYTES_PER_CHAR;
1158 if ( !dst )
1159 {
1160 // optimization: return maximal space which could be needed for this
1161 // string even if the real size could be smaller if the buffer contains
1162 // any surrogates
1163 return inLen;
1164 }
1165
1166 size_t outLen = 0;
1167 const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
1168 for ( const wxUint16 * const inEnd = in + inLen; in < inEnd; )
1169 {
1170 wxUint32 ch;
1171 wxUint16 tmp[2];
1172 tmp[0] = wxUINT16_SWAP_ALWAYS(*in);
1173 in++;
1174 tmp[1] = wxUINT16_SWAP_ALWAYS(*in);
1175
1176 const size_t numChars = decode_utf16(tmp, ch);
1177 if ( numChars == wxCONV_FAILED )
1178 return wxCONV_FAILED;
1179
1180 if ( numChars == 2 )
1181 in++;
1182
1183 if ( ++outLen > dstLen )
1184 return wxCONV_FAILED;
1185
1186 *dst++ = ch;
1187 }
1188
1189
1190 return outLen;
1191 }
1192
1193 size_t
1194 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1195 const wchar_t *src, size_t srcLen) const
1196 {
1197 if ( srcLen == wxNO_LEN )
1198 srcLen = wxWcslen(src) + 1;
1199
1200 size_t outLen = 0;
1201 wxUint16 *out = wx_reinterpret_cast(wxUint16 *, dst);
1202 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1203 {
1204 wxUint16 cc[2];
1205 const size_t numChars = encode_utf16(*src, cc);
1206 if ( numChars == wxCONV_FAILED )
1207 return wxCONV_FAILED;
1208
1209 outLen += numChars*BYTES_PER_CHAR;
1210 if ( out )
1211 {
1212 if ( outLen > dstLen )
1213 return wxCONV_FAILED;
1214
1215 *out++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1216 if ( numChars == 2 )
1217 {
1218 // second character of a surrogate
1219 *out++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1220 }
1221 }
1222 }
1223
1224 return outLen;
1225 }
1226
1227 #endif // WC_UTF16/!WC_UTF16
1228
1229
1230 // ============================================================================
1231 // UTF-32
1232 // ============================================================================
1233
1234 #ifdef WORDS_BIGENDIAN
1235 #define wxMBConvUTF32straight wxMBConvUTF32BE
1236 #define wxMBConvUTF32swap wxMBConvUTF32LE
1237 #else
1238 #define wxMBConvUTF32swap wxMBConvUTF32BE
1239 #define wxMBConvUTF32straight wxMBConvUTF32LE
1240 #endif
1241
1242
1243 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1244 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1245
1246 /* static */
1247 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1248 {
1249 if ( srcLen == wxNO_LEN )
1250 {
1251 // count the number of bytes in input, including the trailing NULs
1252 const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1253 for ( srcLen = 1; *in++; srcLen++ )
1254 ;
1255
1256 srcLen *= BYTES_PER_CHAR;
1257 }
1258 else // we already have the length
1259 {
1260 // we can only convert an entire number of UTF-32 characters
1261 if ( srcLen % BYTES_PER_CHAR )
1262 return wxCONV_FAILED;
1263 }
1264
1265 return srcLen;
1266 }
1267
1268 // case when in-memory representation is UTF-16
1269 #ifdef WC_UTF16
1270
1271 // ----------------------------------------------------------------------------
1272 // conversions without endianness change
1273 // ----------------------------------------------------------------------------
1274
1275 size_t
1276 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1277 const char *src, size_t srcLen) const
1278 {
1279 srcLen = GetLength(src, srcLen);
1280 if ( srcLen == wxNO_LEN )
1281 return wxCONV_FAILED;
1282
1283 const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1284 const size_t inLen = srcLen/BYTES_PER_CHAR;
1285 size_t outLen = 0;
1286 for ( size_t n = 0; n < inLen; n++ )
1287 {
1288 wxUint16 cc[2];
1289 const size_t numChars = encode_utf16(*in++, cc);
1290 if ( numChars == wxCONV_FAILED )
1291 return wxCONV_FAILED;
1292
1293 outLen += numChars;
1294 if ( dst )
1295 {
1296 if ( outLen > dstLen )
1297 return wxCONV_FAILED;
1298
1299 *dst++ = cc[0];
1300 if ( numChars == 2 )
1301 {
1302 // second character of a surrogate
1303 *dst++ = cc[1];
1304 }
1305 }
1306 }
1307
1308 return outLen;
1309 }
1310
1311 size_t
1312 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1313 const wchar_t *src, size_t srcLen) const
1314 {
1315 if ( srcLen == wxNO_LEN )
1316 srcLen = wxWcslen(src) + 1;
1317
1318 if ( !dst )
1319 {
1320 // optimization: return maximal space which could be needed for this
1321 // string instead of the exact amount which could be less if there are
1322 // any surrogates in the input
1323 //
1324 // we consider that surrogates are rare enough to make it worthwhile to
1325 // avoid running the loop below at the cost of slightly extra memory
1326 // consumption
1327 return srcLen*BYTES_PER_CHAR;
1328 }
1329
1330 wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
1331 size_t outLen = 0;
1332 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1333 {
1334 const wxUint32 ch = wxDecodeSurrogate(&src);
1335 if ( !src )
1336 return wxCONV_FAILED;
1337
1338 outLen += BYTES_PER_CHAR;
1339
1340 if ( outLen > dstLen )
1341 return wxCONV_FAILED;
1342
1343 *out++ = ch;
1344 }
1345
1346 return outLen;
1347 }
1348
1349 // ----------------------------------------------------------------------------
1350 // endian-reversing conversions
1351 // ----------------------------------------------------------------------------
1352
1353 size_t
1354 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1355 const char *src, size_t srcLen) const
1356 {
1357 srcLen = GetLength(src, srcLen);
1358 if ( srcLen == wxNO_LEN )
1359 return wxCONV_FAILED;
1360
1361 const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1362 const size_t inLen = srcLen/BYTES_PER_CHAR;
1363 size_t outLen = 0;
1364 for ( size_t n = 0; n < inLen; n++, in++ )
1365 {
1366 wxUint16 cc[2];
1367 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*in), cc);
1368 if ( numChars == wxCONV_FAILED )
1369 return wxCONV_FAILED;
1370
1371 outLen += numChars;
1372 if ( dst )
1373 {
1374 if ( outLen > dstLen )
1375 return wxCONV_FAILED;
1376
1377 *dst++ = cc[0];
1378 if ( numChars == 2 )
1379 {
1380 // second character of a surrogate
1381 *dst++ = cc[1];
1382 }
1383 }
1384 }
1385
1386 return outLen;
1387 }
1388
1389 size_t
1390 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1391 const wchar_t *src, size_t srcLen) const
1392 {
1393 if ( srcLen == wxNO_LEN )
1394 srcLen = wxWcslen(src) + 1;
1395
1396 if ( !dst )
1397 {
1398 // optimization: return maximal space which could be needed for this
1399 // string instead of the exact amount which could be less if there are
1400 // any surrogates in the input
1401 //
1402 // we consider that surrogates are rare enough to make it worthwhile to
1403 // avoid running the loop below at the cost of slightly extra memory
1404 // consumption
1405 return srcLen*BYTES_PER_CHAR;
1406 }
1407
1408 wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
1409 size_t outLen = 0;
1410 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1411 {
1412 const wxUint32 ch = wxDecodeSurrogate(&src);
1413 if ( !src )
1414 return wxCONV_FAILED;
1415
1416 outLen += BYTES_PER_CHAR;
1417
1418 if ( outLen > dstLen )
1419 return wxCONV_FAILED;
1420
1421 *out++ = wxUINT32_SWAP_ALWAYS(ch);
1422 }
1423
1424 return outLen;
1425 }
1426
1427 #else // !WC_UTF16: wchar_t is UTF-32
1428
1429 // ----------------------------------------------------------------------------
1430 // conversions without endianness change
1431 // ----------------------------------------------------------------------------
1432
1433 size_t
1434 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1435 const char *src, size_t srcLen) const
1436 {
1437 // use memcpy() as it should be much faster than hand-written loop
1438 srcLen = GetLength(src, srcLen);
1439 if ( srcLen == wxNO_LEN )
1440 return wxCONV_FAILED;
1441
1442 const size_t inLen = srcLen/BYTES_PER_CHAR;
1443 if ( dst )
1444 {
1445 if ( dstLen < inLen )
1446 return wxCONV_FAILED;
1447
1448 memcpy(dst, src, srcLen);
1449 }
1450
1451 return inLen;
1452 }
1453
1454 size_t
1455 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1456 const wchar_t *src, size_t srcLen) const
1457 {
1458 if ( srcLen == wxNO_LEN )
1459 srcLen = wxWcslen(src) + 1;
1460
1461 srcLen *= BYTES_PER_CHAR;
1462
1463 if ( dst )
1464 {
1465 if ( dstLen < srcLen )
1466 return wxCONV_FAILED;
1467
1468 memcpy(dst, src, srcLen);
1469 }
1470
1471 return srcLen;
1472 }
1473
1474 // ----------------------------------------------------------------------------
1475 // endian-reversing conversions
1476 // ----------------------------------------------------------------------------
1477
1478 size_t
1479 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1480 const char *src, size_t srcLen) const
1481 {
1482 srcLen = GetLength(src, srcLen);
1483 if ( srcLen == wxNO_LEN )
1484 return wxCONV_FAILED;
1485
1486 srcLen /= BYTES_PER_CHAR;
1487
1488 if ( dst )
1489 {
1490 if ( dstLen < srcLen )
1491 return wxCONV_FAILED;
1492
1493 const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1494 for ( size_t n = 0; n < srcLen; n++, in++ )
1495 {
1496 *dst++ = wxUINT32_SWAP_ALWAYS(*in);
1497 }
1498 }
1499
1500 return srcLen;
1501 }
1502
1503 size_t
1504 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1505 const wchar_t *src, size_t srcLen) const
1506 {
1507 if ( srcLen == wxNO_LEN )
1508 srcLen = wxWcslen(src) + 1;
1509
1510 srcLen *= BYTES_PER_CHAR;
1511
1512 if ( dst )
1513 {
1514 if ( dstLen < srcLen )
1515 return wxCONV_FAILED;
1516
1517 wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
1518 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1519 {
1520 *out++ = wxUINT32_SWAP_ALWAYS(*src);
1521 }
1522 }
1523
1524 return srcLen;
1525 }
1526
1527 #endif // WC_UTF16/!WC_UTF16
1528
1529
1530 // ============================================================================
1531 // The classes doing conversion using the iconv_xxx() functions
1532 // ============================================================================
1533
1534 #ifdef HAVE_ICONV
1535
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) takes the value returned by iconv() and the
// number of bytes left in its input buffer and evaluates to true if the call
// really failed.
//
// NB: the macro arguments are parenthesized so that expressions can be safely
//     passed to it.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast the address of an input pointer to the type of the corresponding
// iconv() parameter
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// the value of an iconv_t handle which is not valid
#define ICONV_T_INVALID ((iconv_t)-1)
1554
1555 #if SIZEOF_WCHAR_T == 4
1556 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1557 #define WC_ENC wxFONTENCODING_UTF32
1558 #elif SIZEOF_WCHAR_T == 2
1559 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1560 #define WC_ENC wxFONTENCODING_UTF16
1561 #else // sizeof(wchar_t) != 2 nor 4
1562 // does this ever happen?
1563 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1564 #endif
1565
1566 // ----------------------------------------------------------------------------
1567 // wxMBConv_iconv: encapsulates an iconv character set
1568 // ----------------------------------------------------------------------------
1569
1570 class wxMBConv_iconv : public wxMBConv
1571 {
1572 public:
1573 wxMBConv_iconv(const wxChar *name);
1574 virtual ~wxMBConv_iconv();
1575
1576 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1577 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1578
1579 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1580 virtual size_t GetMBNulLen() const;
1581
1582 virtual wxMBConv *Clone() const
1583 {
1584 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1585 p->m_minMBCharWidth = m_minMBCharWidth;
1586 return p;
1587 }
1588
1589 bool IsOk() const
1590 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1591
1592 protected:
1593 // the iconv handlers used to translate from multibyte to wide char and in
1594 // the other direction
1595 iconv_t m2w,
1596 w2m;
1597 #if wxUSE_THREADS
1598 // guards access to m2w and w2m objects
1599 wxMutex m_iconvMutex;
1600 #endif
1601
1602 private:
1603 // the name (for iconv_open()) of a wide char charset -- if none is
1604 // available on this machine, it will remain NULL
1605 static wxString ms_wcCharsetName;
1606
1607 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1608 // different endian-ness than the native one
1609 static bool ms_wcNeedsSwap;
1610
1611
1612 // name of the encoding handled by this conversion
1613 wxString m_name;
1614
1615 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1616 // initially
1617 size_t m_minMBCharWidth;
1618 };
1619
1620 // make the constructor available for unit testing
1621 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1622 {
1623 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1624 if ( !result->IsOk() )
1625 {
1626 delete result;
1627 return 0;
1628 }
1629 return result;
1630 }
1631
1632 wxString wxMBConv_iconv::ms_wcCharsetName;
1633 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1634
1635 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1636 : m_name(name)
1637 {
1638 m_minMBCharWidth = 0;
1639
1640 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1641 // names for the charsets
1642 const wxCharBuffer cname(wxString(name).ToAscii());
1643
1644 // check for charset that represents wchar_t:
1645 if ( ms_wcCharsetName.empty() )
1646 {
1647 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1648
1649 #if wxUSE_FONTMAP
1650 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1651 #else // !wxUSE_FONTMAP
1652 static const wxChar *names[] =
1653 {
1654 #if SIZEOF_WCHAR_T == 4
1655 _T("UCS-4"),
1656 #elif SIZEOF_WCHAR_T = 2
1657 _T("UCS-2"),
1658 #endif
1659 NULL
1660 };
1661 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1662
1663 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1664 {
1665 const wxString nameCS(*names);
1666
1667 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1668 wxString nameXE(nameCS);
1669 #ifdef WORDS_BIGENDIAN
1670 nameXE += _T("BE");
1671 #else // little endian
1672 nameXE += _T("LE");
1673 #endif
1674
1675 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1676 nameXE.c_str());
1677
1678 m2w = iconv_open(nameXE.ToAscii(), cname);
1679 if ( m2w == ICONV_T_INVALID )
1680 {
1681 // try charset w/o bytesex info (e.g. "UCS4")
1682 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1683 nameCS.c_str());
1684 m2w = iconv_open(nameCS.ToAscii(), cname);
1685
1686 // and check for bytesex ourselves:
1687 if ( m2w != ICONV_T_INVALID )
1688 {
1689 char buf[2], *bufPtr;
1690 wchar_t wbuf[2], *wbufPtr;
1691 size_t insz, outsz;
1692 size_t res;
1693
1694 buf[0] = 'A';
1695 buf[1] = 0;
1696 wbuf[0] = 0;
1697 insz = 2;
1698 outsz = SIZEOF_WCHAR_T * 2;
1699 wbufPtr = wbuf;
1700 bufPtr = buf;
1701
1702 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1703 (char**)&wbufPtr, &outsz);
1704
1705 if (ICONV_FAILED(res, insz))
1706 {
1707 wxLogLastError(wxT("iconv"));
1708 wxLogError(_("Conversion to charset '%s' doesn't work."),
1709 nameCS.c_str());
1710 }
1711 else // ok, can convert to this encoding, remember it
1712 {
1713 ms_wcCharsetName = nameCS;
1714 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1715 }
1716 }
1717 }
1718 else // use charset not requiring byte swapping
1719 {
1720 ms_wcCharsetName = nameXE;
1721 }
1722 }
1723
1724 wxLogTrace(TRACE_STRCONV,
1725 wxT("iconv wchar_t charset is \"%s\"%s"),
1726 ms_wcCharsetName.empty() ? _T("<none>")
1727 : ms_wcCharsetName.c_str(),
1728 ms_wcNeedsSwap ? _T(" (needs swap)")
1729 : _T(""));
1730 }
1731 else // we already have ms_wcCharsetName
1732 {
1733 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1734 }
1735
1736 if ( ms_wcCharsetName.empty() )
1737 {
1738 w2m = ICONV_T_INVALID;
1739 }
1740 else
1741 {
1742 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1743 if ( w2m == ICONV_T_INVALID )
1744 {
1745 wxLogTrace(TRACE_STRCONV,
1746 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1747 ms_wcCharsetName.c_str(), cname.data());
1748 }
1749 }
1750 }
1751
1752 wxMBConv_iconv::~wxMBConv_iconv()
1753 {
1754 if ( m2w != ICONV_T_INVALID )
1755 iconv_close(m2w);
1756 if ( w2m != ICONV_T_INVALID )
1757 iconv_close(w2m);
1758 }
1759
1760 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1761 {
1762 // find the string length: notice that must be done differently for
1763 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1764 size_t inbuf;
1765 const size_t nulLen = GetMBNulLen();
1766 switch ( nulLen )
1767 {
1768 default:
1769 return wxCONV_FAILED;
1770
1771 case 1:
1772 inbuf = strlen(psz); // arguably more optimized than our version
1773 break;
1774
1775 case 2:
1776 case 4:
1777 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1778 // they also have to start at character boundary and not span two
1779 // adjacent characters
1780 const char *p;
1781 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1782 ;
1783 inbuf = p - psz;
1784 break;
1785 }
1786
1787 #if wxUSE_THREADS
1788 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1789 // Unfortunately there is a couple of global wxCSConv objects such as
1790 // wxConvLocal that are used all over wx code, so we have to make sure
1791 // the handle is used by at most one thread at the time. Otherwise
1792 // only a few wx classes would be safe to use from non-main threads
1793 // as MB<->WC conversion would fail "randomly".
1794 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1795 #endif // wxUSE_THREADS
1796
1797
1798 size_t outbuf = n * SIZEOF_WCHAR_T;
1799 size_t res, cres;
1800 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1801 wchar_t *bufPtr = buf;
1802 const char *pszPtr = psz;
1803
1804 if (buf)
1805 {
1806 // have destination buffer, convert there
1807 cres = iconv(m2w,
1808 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1809 (char**)&bufPtr, &outbuf);
1810 res = n - (outbuf / SIZEOF_WCHAR_T);
1811
1812 if (ms_wcNeedsSwap)
1813 {
1814 // convert to native endianness
1815 for ( unsigned i = 0; i < res; i++ )
1816 buf[n] = WC_BSWAP(buf[i]);
1817 }
1818
1819 // NUL-terminate the string if there is any space left
1820 if (res < n)
1821 buf[res] = 0;
1822 }
1823 else
1824 {
1825 // no destination buffer... convert using temp buffer
1826 // to calculate destination buffer requirement
1827 wchar_t tbuf[8];
1828 res = 0;
1829 do {
1830 bufPtr = tbuf;
1831 outbuf = 8*SIZEOF_WCHAR_T;
1832
1833 cres = iconv(m2w,
1834 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1835 (char**)&bufPtr, &outbuf );
1836
1837 res += 8-(outbuf/SIZEOF_WCHAR_T);
1838 } while ((cres==(size_t)-1) && (errno==E2BIG));
1839 }
1840
1841 if (ICONV_FAILED(cres, inbuf))
1842 {
1843 //VS: it is ok if iconv fails, hence trace only
1844 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1845 return wxCONV_FAILED;
1846 }
1847
1848 return res;
1849 }
1850
1851 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1852 {
1853 #if wxUSE_THREADS
1854 // NB: explained in MB2WC
1855 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1856 #endif
1857
1858 size_t inlen = wxWcslen(psz);
1859 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1860 size_t outbuf = n;
1861 size_t res, cres;
1862
1863 wchar_t *tmpbuf = 0;
1864
1865 if (ms_wcNeedsSwap)
1866 {
1867 // need to copy to temp buffer to switch endianness
1868 // (doing WC_BSWAP twice on the original buffer won't help, as it
1869 // could be in read-only memory, or be accessed in some other thread)
1870 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1871 for ( size_t i = 0; i < inlen; i++ )
1872 tmpbuf[n] = WC_BSWAP(psz[i]);
1873 tmpbuf[inlen] = L'\0';
1874 psz = tmpbuf;
1875 }
1876
1877 if (buf)
1878 {
1879 // have destination buffer, convert there
1880 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1881
1882 res = n-outbuf;
1883
1884 // NB: iconv was given only wcslen(psz) characters on input, and so
1885 // it couldn't convert the trailing zero. Let's do it ourselves
1886 // if there's some room left for it in the output buffer.
1887 if (res < n)
1888 buf[0] = 0;
1889 }
1890 else
1891 {
1892 // no destination buffer... convert using temp buffer
1893 // to calculate destination buffer requirement
1894 char tbuf[16];
1895 res = 0;
1896 do {
1897 buf = tbuf; outbuf = 16;
1898
1899 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1900
1901 res += 16 - outbuf;
1902 } while ((cres==(size_t)-1) && (errno==E2BIG));
1903 }
1904
1905 if (ms_wcNeedsSwap)
1906 {
1907 free(tmpbuf);
1908 }
1909
1910 if (ICONV_FAILED(cres, inbuf))
1911 {
1912 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1913 return wxCONV_FAILED;
1914 }
1915
1916 return res;
1917 }
1918
1919 size_t wxMBConv_iconv::GetMBNulLen() const
1920 {
1921 if ( m_minMBCharWidth == 0 )
1922 {
1923 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1924
1925 #if wxUSE_THREADS
1926 // NB: explained in MB2WC
1927 wxMutexLocker lock(self->m_iconvMutex);
1928 #endif
1929
1930 wchar_t *wnul = L"";
1931 char buf[8]; // should be enough for NUL in any encoding
1932 size_t inLen = sizeof(wchar_t),
1933 outLen = WXSIZEOF(buf);
1934 char *in = (char *)wnul;
1935 char *out = buf;
1936 if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1937 {
1938 self->m_minMBCharWidth = (size_t)-1;
1939 }
1940 else // ok
1941 {
1942 self->m_minMBCharWidth = out - buf;
1943 }
1944 }
1945
1946 return m_minMBCharWidth;
1947 }
1948
1949 #endif // HAVE_ICONV
1950
1951
1952 // ============================================================================
1953 // Win32 conversion classes
1954 // ============================================================================
1955
1956 #ifdef wxHAVE_WIN32_MB2WC
1957
1958 // from utils.cpp
1959 #if wxUSE_FONTMAP
1960 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1961 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1962 #endif
1963
1964 class wxMBConv_win32 : public wxMBConv
1965 {
1966 public:
1967 wxMBConv_win32()
1968 {
1969 m_CodePage = CP_ACP;
1970 m_minMBCharWidth = 0;
1971 }
1972
1973 wxMBConv_win32(const wxMBConv_win32& conv)
1974 {
1975 m_CodePage = conv.m_CodePage;
1976 m_minMBCharWidth = conv.m_minMBCharWidth;
1977 }
1978
1979 #if wxUSE_FONTMAP
1980 wxMBConv_win32(const wxChar* name)
1981 {
1982 m_CodePage = wxCharsetToCodepage(name);
1983 m_minMBCharWidth = 0;
1984 }
1985
1986 wxMBConv_win32(wxFontEncoding encoding)
1987 {
1988 m_CodePage = wxEncodingToCodepage(encoding);
1989 m_minMBCharWidth = 0;
1990 }
1991 #endif // wxUSE_FONTMAP
1992
1993 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1994 {
1995 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1996 // the behaviour is not compatible with the Unix version (using iconv)
1997 // and break the library itself, e.g. wxTextInputStream::NextChar()
1998 // wouldn't work if reading an incomplete MB char didn't result in an
1999 // error
2000 //
2001 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2002 // Win XP or newer and it is not supported for UTF-[78] so we always
2003 // use our own conversions in this case. See
2004 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2005 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2006 if ( m_CodePage == CP_UTF8 )
2007 {
2008 return wxConvUTF8.MB2WC(buf, psz, n);
2009 }
2010
2011 if ( m_CodePage == CP_UTF7 )
2012 {
2013 return wxConvUTF7.MB2WC(buf, psz, n);
2014 }
2015
2016 int flags = 0;
2017 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2018 IsAtLeastWin2kSP4() )
2019 {
2020 flags = MB_ERR_INVALID_CHARS;
2021 }
2022
2023 const size_t len = ::MultiByteToWideChar
2024 (
2025 m_CodePage, // code page
2026 flags, // flags: fall on error
2027 psz, // input string
2028 -1, // its length (NUL-terminated)
2029 buf, // output string
2030 buf ? n : 0 // size of output buffer
2031 );
2032 if ( !len )
2033 {
2034 // function totally failed
2035 return wxCONV_FAILED;
2036 }
2037
2038 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2039 // check if we succeeded, by doing a double trip:
2040 if ( !flags && buf )
2041 {
2042 const size_t mbLen = strlen(psz);
2043 wxCharBuffer mbBuf(mbLen);
2044 if ( ::WideCharToMultiByte
2045 (
2046 m_CodePage,
2047 0,
2048 buf,
2049 -1,
2050 mbBuf.data(),
2051 mbLen + 1, // size in bytes, not length
2052 NULL,
2053 NULL
2054 ) == 0 ||
2055 strcmp(mbBuf, psz) != 0 )
2056 {
2057 // we didn't obtain the same thing we started from, hence
2058 // the conversion was lossy and we consider that it failed
2059 return wxCONV_FAILED;
2060 }
2061 }
2062
2063 // note that it returns count of written chars for buf != NULL and size
2064 // of the needed buffer for buf == NULL so in either case the length of
2065 // the string (which never includes the terminating NUL) is one less
2066 return len - 1;
2067 }
2068
2069 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2070 {
2071 /*
2072 we have a problem here: by default, WideCharToMultiByte() may
2073 replace characters unrepresentable in the target code page with bad
2074 quality approximations such as turning "1/2" symbol (U+00BD) into
2075 "1" for the code pages which don't have it and we, obviously, want
2076 to avoid this at any price
2077
2078 the trouble is that this function does it _silently_, i.e. it won't
2079 even tell us whether it did or not... Win98/2000 and higher provide
2080 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2081 we have to resort to a round trip, i.e. check that converting back
2082 results in the same string -- this is, of course, expensive but
2083 otherwise we simply can't be sure to not garble the data.
2084 */
2085
2086 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2087 // it doesn't work with CJK encodings (which we test for rather roughly
2088 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2089 // supporting it
2090 BOOL usedDef wxDUMMY_INITIALIZE(false);
2091 BOOL *pUsedDef;
2092 int flags;
2093 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2094 {
2095 // it's our lucky day
2096 flags = WC_NO_BEST_FIT_CHARS;
2097 pUsedDef = &usedDef;
2098 }
2099 else // old system or unsupported encoding
2100 {
2101 flags = 0;
2102 pUsedDef = NULL;
2103 }
2104
2105 const size_t len = ::WideCharToMultiByte
2106 (
2107 m_CodePage, // code page
2108 flags, // either none or no best fit
2109 pwz, // input string
2110 -1, // it is (wide) NUL-terminated
2111 buf, // output buffer
2112 buf ? n : 0, // and its size
2113 NULL, // default "replacement" char
2114 pUsedDef // [out] was it used?
2115 );
2116
2117 if ( !len )
2118 {
2119 // function totally failed
2120 return wxCONV_FAILED;
2121 }
2122
2123 // if we were really converting, check if we succeeded
2124 if ( buf )
2125 {
2126 if ( flags )
2127 {
2128 // check if the conversion failed, i.e. if any replacements
2129 // were done
2130 if ( usedDef )
2131 return wxCONV_FAILED;
2132 }
2133 else // we must resort to double tripping...
2134 {
2135 wxWCharBuffer wcBuf(n);
2136 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2137 wcscmp(wcBuf, pwz) != 0 )
2138 {
2139 // we didn't obtain the same thing we started from, hence
2140 // the conversion was lossy and we consider that it failed
2141 return wxCONV_FAILED;
2142 }
2143 }
2144 }
2145
2146 // see the comment above for the reason of "len - 1"
2147 return len - 1;
2148 }
2149
2150 virtual size_t GetMBNulLen() const
2151 {
2152 if ( m_minMBCharWidth == 0 )
2153 {
2154 int len = ::WideCharToMultiByte
2155 (
2156 m_CodePage, // code page
2157 0, // no flags
2158 L"", // input string
2159 1, // translate just the NUL
2160 NULL, // output buffer
2161 0, // and its size
2162 NULL, // no replacement char
2163 NULL // [out] don't care if it was used
2164 );
2165
2166 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2167 switch ( len )
2168 {
2169 default:
2170 wxLogDebug(_T("Unexpected NUL length %d"), len);
2171 // fall through
2172
2173 case 0:
2174 self->m_minMBCharWidth = (size_t)-1;
2175 break;
2176
2177 case 1:
2178 case 2:
2179 case 4:
2180 self->m_minMBCharWidth = len;
2181 break;
2182 }
2183 }
2184
2185 return m_minMBCharWidth;
2186 }
2187
2188 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2189
2190 bool IsOk() const { return m_CodePage != -1; }
2191
2192 private:
2193 static bool CanUseNoBestFit()
2194 {
2195 static int s_isWin98Or2k = -1;
2196
2197 if ( s_isWin98Or2k == -1 )
2198 {
2199 int verMaj, verMin;
2200 switch ( wxGetOsVersion(&verMaj, &verMin) )
2201 {
2202 case wxWIN95:
2203 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2204 break;
2205
2206 case wxWINDOWS_NT:
2207 s_isWin98Or2k = verMaj >= 5;
2208 break;
2209
2210 default:
2211 // unknown, be conseravtive by default
2212 s_isWin98Or2k = 0;
2213 }
2214
2215 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2216 }
2217
2218 return s_isWin98Or2k == 1;
2219 }
2220
2221 static bool IsAtLeastWin2kSP4()
2222 {
2223 #ifdef __WXWINCE__
2224 return false;
2225 #else
2226 static int s_isAtLeastWin2kSP4 = -1;
2227
2228 if ( s_isAtLeastWin2kSP4 == -1 )
2229 {
2230 OSVERSIONINFOEX ver;
2231
2232 memset(&ver, 0, sizeof(ver));
2233 ver.dwOSVersionInfoSize = sizeof(ver);
2234 GetVersionEx((OSVERSIONINFO*)&ver);
2235
2236 s_isAtLeastWin2kSP4 =
2237 ((ver.dwMajorVersion > 5) || // Vista+
2238 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2239 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2240 ver.wServicePackMajor >= 4)) // 2000 SP4+
2241 ? 1 : 0;
2242 }
2243
2244 return s_isAtLeastWin2kSP4 == 1;
2245 #endif
2246 }
2247
2248
2249 // the code page we're working with
2250 long m_CodePage;
2251
2252 // cached result of GetMBNulLen(), set to 0 initially meaning
2253 // "unknown"
2254 size_t m_minMBCharWidth;
2255 };
2256
2257 #endif // wxHAVE_WIN32_MB2WC
2258
2259 // ============================================================================
2260 // Cocoa conversion classes
2261 // ============================================================================
2262
2263 #if defined(__WXCOCOA__)
2264
2265 // RN: There is no UTF-32 support in either Core Foundation or
2266 // Cocoa. Strangely enough, Core Foundation uses UTF-32
2267 // internally quite a bit - it's just not public (yet).
2268
2269 #include <CoreFoundation/CFString.h>
2270 #include <CoreFoundation/CFStringEncodingExt.h>
2271
2272 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2273 {
2274 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2275 if ( encoding == wxFONTENCODING_DEFAULT )
2276 {
2277 enc = CFStringGetSystemEncoding();
2278 }
2279 else switch( encoding)
2280 {
2281 case wxFONTENCODING_ISO8859_1 :
2282 enc = kCFStringEncodingISOLatin1 ;
2283 break ;
2284 case wxFONTENCODING_ISO8859_2 :
2285 enc = kCFStringEncodingISOLatin2;
2286 break ;
2287 case wxFONTENCODING_ISO8859_3 :
2288 enc = kCFStringEncodingISOLatin3 ;
2289 break ;
2290 case wxFONTENCODING_ISO8859_4 :
2291 enc = kCFStringEncodingISOLatin4;
2292 break ;
2293 case wxFONTENCODING_ISO8859_5 :
2294 enc = kCFStringEncodingISOLatinCyrillic;
2295 break ;
2296 case wxFONTENCODING_ISO8859_6 :
2297 enc = kCFStringEncodingISOLatinArabic;
2298 break ;
2299 case wxFONTENCODING_ISO8859_7 :
2300 enc = kCFStringEncodingISOLatinGreek;
2301 break ;
2302 case wxFONTENCODING_ISO8859_8 :
2303 enc = kCFStringEncodingISOLatinHebrew;
2304 break ;
2305 case wxFONTENCODING_ISO8859_9 :
2306 enc = kCFStringEncodingISOLatin5;
2307 break ;
2308 case wxFONTENCODING_ISO8859_10 :
2309 enc = kCFStringEncodingISOLatin6;
2310 break ;
2311 case wxFONTENCODING_ISO8859_11 :
2312 enc = kCFStringEncodingISOLatinThai;
2313 break ;
2314 case wxFONTENCODING_ISO8859_13 :
2315 enc = kCFStringEncodingISOLatin7;
2316 break ;
2317 case wxFONTENCODING_ISO8859_14 :
2318 enc = kCFStringEncodingISOLatin8;
2319 break ;
2320 case wxFONTENCODING_ISO8859_15 :
2321 enc = kCFStringEncodingISOLatin9;
2322 break ;
2323
2324 case wxFONTENCODING_KOI8 :
2325 enc = kCFStringEncodingKOI8_R;
2326 break ;
2327 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2328 enc = kCFStringEncodingDOSRussian;
2329 break ;
2330
2331 // case wxFONTENCODING_BULGARIAN :
2332 // enc = ;
2333 // break ;
2334
2335 case wxFONTENCODING_CP437 :
2336 enc =kCFStringEncodingDOSLatinUS ;
2337 break ;
2338 case wxFONTENCODING_CP850 :
2339 enc = kCFStringEncodingDOSLatin1;
2340 break ;
2341 case wxFONTENCODING_CP852 :
2342 enc = kCFStringEncodingDOSLatin2;
2343 break ;
2344 case wxFONTENCODING_CP855 :
2345 enc = kCFStringEncodingDOSCyrillic;
2346 break ;
2347 case wxFONTENCODING_CP866 :
2348 enc =kCFStringEncodingDOSRussian ;
2349 break ;
2350 case wxFONTENCODING_CP874 :
2351 enc = kCFStringEncodingDOSThai;
2352 break ;
2353 case wxFONTENCODING_CP932 :
2354 enc = kCFStringEncodingDOSJapanese;
2355 break ;
2356 case wxFONTENCODING_CP936 :
2357 enc =kCFStringEncodingDOSChineseSimplif ;
2358 break ;
2359 case wxFONTENCODING_CP949 :
2360 enc = kCFStringEncodingDOSKorean;
2361 break ;
2362 case wxFONTENCODING_CP950 :
2363 enc = kCFStringEncodingDOSChineseTrad;
2364 break ;
2365 case wxFONTENCODING_CP1250 :
2366 enc = kCFStringEncodingWindowsLatin2;
2367 break ;
2368 case wxFONTENCODING_CP1251 :
2369 enc =kCFStringEncodingWindowsCyrillic ;
2370 break ;
2371 case wxFONTENCODING_CP1252 :
2372 enc =kCFStringEncodingWindowsLatin1 ;
2373 break ;
2374 case wxFONTENCODING_CP1253 :
2375 enc = kCFStringEncodingWindowsGreek;
2376 break ;
2377 case wxFONTENCODING_CP1254 :
2378 enc = kCFStringEncodingWindowsLatin5;
2379 break ;
2380 case wxFONTENCODING_CP1255 :
2381 enc =kCFStringEncodingWindowsHebrew ;
2382 break ;
2383 case wxFONTENCODING_CP1256 :
2384 enc =kCFStringEncodingWindowsArabic ;
2385 break ;
2386 case wxFONTENCODING_CP1257 :
2387 enc = kCFStringEncodingWindowsBalticRim;
2388 break ;
2389 // This only really encodes to UTF7 (if that) evidently
2390 // case wxFONTENCODING_UTF7 :
2391 // enc = kCFStringEncodingNonLossyASCII ;
2392 // break ;
2393 case wxFONTENCODING_UTF8 :
2394 enc = kCFStringEncodingUTF8 ;
2395 break ;
2396 case wxFONTENCODING_EUC_JP :
2397 enc = kCFStringEncodingEUC_JP;
2398 break ;
2399 case wxFONTENCODING_UTF16 :
2400 enc = kCFStringEncodingUnicode ;
2401 break ;
2402 case wxFONTENCODING_MACROMAN :
2403 enc = kCFStringEncodingMacRoman ;
2404 break ;
2405 case wxFONTENCODING_MACJAPANESE :
2406 enc = kCFStringEncodingMacJapanese ;
2407 break ;
2408 case wxFONTENCODING_MACCHINESETRAD :
2409 enc = kCFStringEncodingMacChineseTrad ;
2410 break ;
2411 case wxFONTENCODING_MACKOREAN :
2412 enc = kCFStringEncodingMacKorean ;
2413 break ;
2414 case wxFONTENCODING_MACARABIC :
2415 enc = kCFStringEncodingMacArabic ;
2416 break ;
2417 case wxFONTENCODING_MACHEBREW :
2418 enc = kCFStringEncodingMacHebrew ;
2419 break ;
2420 case wxFONTENCODING_MACGREEK :
2421 enc = kCFStringEncodingMacGreek ;
2422 break ;
2423 case wxFONTENCODING_MACCYRILLIC :
2424 enc = kCFStringEncodingMacCyrillic ;
2425 break ;
2426 case wxFONTENCODING_MACDEVANAGARI :
2427 enc = kCFStringEncodingMacDevanagari ;
2428 break ;
2429 case wxFONTENCODING_MACGURMUKHI :
2430 enc = kCFStringEncodingMacGurmukhi ;
2431 break ;
2432 case wxFONTENCODING_MACGUJARATI :
2433 enc = kCFStringEncodingMacGujarati ;
2434 break ;
2435 case wxFONTENCODING_MACORIYA :
2436 enc = kCFStringEncodingMacOriya ;
2437 break ;
2438 case wxFONTENCODING_MACBENGALI :
2439 enc = kCFStringEncodingMacBengali ;
2440 break ;
2441 case wxFONTENCODING_MACTAMIL :
2442 enc = kCFStringEncodingMacTamil ;
2443 break ;
2444 case wxFONTENCODING_MACTELUGU :
2445 enc = kCFStringEncodingMacTelugu ;
2446 break ;
2447 case wxFONTENCODING_MACKANNADA :
2448 enc = kCFStringEncodingMacKannada ;
2449 break ;
2450 case wxFONTENCODING_MACMALAJALAM :
2451 enc = kCFStringEncodingMacMalayalam ;
2452 break ;
2453 case wxFONTENCODING_MACSINHALESE :
2454 enc = kCFStringEncodingMacSinhalese ;
2455 break ;
2456 case wxFONTENCODING_MACBURMESE :
2457 enc = kCFStringEncodingMacBurmese ;
2458 break ;
2459 case wxFONTENCODING_MACKHMER :
2460 enc = kCFStringEncodingMacKhmer ;
2461 break ;
2462 case wxFONTENCODING_MACTHAI :
2463 enc = kCFStringEncodingMacThai ;
2464 break ;
2465 case wxFONTENCODING_MACLAOTIAN :
2466 enc = kCFStringEncodingMacLaotian ;
2467 break ;
2468 case wxFONTENCODING_MACGEORGIAN :
2469 enc = kCFStringEncodingMacGeorgian ;
2470 break ;
2471 case wxFONTENCODING_MACARMENIAN :
2472 enc = kCFStringEncodingMacArmenian ;
2473 break ;
2474 case wxFONTENCODING_MACCHINESESIMP :
2475 enc = kCFStringEncodingMacChineseSimp ;
2476 break ;
2477 case wxFONTENCODING_MACTIBETAN :
2478 enc = kCFStringEncodingMacTibetan ;
2479 break ;
2480 case wxFONTENCODING_MACMONGOLIAN :
2481 enc = kCFStringEncodingMacMongolian ;
2482 break ;
2483 case wxFONTENCODING_MACETHIOPIC :
2484 enc = kCFStringEncodingMacEthiopic ;
2485 break ;
2486 case wxFONTENCODING_MACCENTRALEUR :
2487 enc = kCFStringEncodingMacCentralEurRoman ;
2488 break ;
2489 case wxFONTENCODING_MACVIATNAMESE :
2490 enc = kCFStringEncodingMacVietnamese ;
2491 break ;
2492 case wxFONTENCODING_MACARABICEXT :
2493 enc = kCFStringEncodingMacExtArabic ;
2494 break ;
2495 case wxFONTENCODING_MACSYMBOL :
2496 enc = kCFStringEncodingMacSymbol ;
2497 break ;
2498 case wxFONTENCODING_MACDINGBATS :
2499 enc = kCFStringEncodingMacDingbats ;
2500 break ;
2501 case wxFONTENCODING_MACTURKISH :
2502 enc = kCFStringEncodingMacTurkish ;
2503 break ;
2504 case wxFONTENCODING_MACCROATIAN :
2505 enc = kCFStringEncodingMacCroatian ;
2506 break ;
2507 case wxFONTENCODING_MACICELANDIC :
2508 enc = kCFStringEncodingMacIcelandic ;
2509 break ;
2510 case wxFONTENCODING_MACROMANIAN :
2511 enc = kCFStringEncodingMacRomanian ;
2512 break ;
2513 case wxFONTENCODING_MACCELTIC :
2514 enc = kCFStringEncodingMacCeltic ;
2515 break ;
2516 case wxFONTENCODING_MACGAELIC :
2517 enc = kCFStringEncodingMacGaelic ;
2518 break ;
2519 // case wxFONTENCODING_MACKEYBOARD :
2520 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2521 // break ;
2522 default :
2523 // because gcc is picky
2524 break ;
2525 } ;
2526 return enc ;
2527 }
2528
2529 class wxMBConv_cocoa : public wxMBConv
2530 {
2531 public:
2532 wxMBConv_cocoa()
2533 {
2534 Init(CFStringGetSystemEncoding()) ;
2535 }
2536
2537 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2538 {
2539 m_encoding = conv.m_encoding;
2540 }
2541
2542 #if wxUSE_FONTMAP
2543 wxMBConv_cocoa(const wxChar* name)
2544 {
2545 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2546 }
2547 #endif
2548
2549 wxMBConv_cocoa(wxFontEncoding encoding)
2550 {
2551 Init( wxCFStringEncFromFontEnc(encoding) );
2552 }
2553
2554 ~wxMBConv_cocoa()
2555 {
2556 }
2557
2558 void Init( CFStringEncoding encoding)
2559 {
2560 m_encoding = encoding ;
2561 }
2562
2563 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2564 {
2565 wxASSERT(szUnConv);
2566
2567 CFStringRef theString = CFStringCreateWithBytes (
2568 NULL, //the allocator
2569 (const UInt8*)szUnConv,
2570 strlen(szUnConv),
2571 m_encoding,
2572 false //no BOM/external representation
2573 );
2574
2575 wxASSERT(theString);
2576
2577 size_t nOutLength = CFStringGetLength(theString);
2578
2579 if (szOut == NULL)
2580 {
2581 CFRelease(theString);
2582 return nOutLength;
2583 }
2584
2585 CFRange theRange = { 0, nOutSize };
2586
2587 #if SIZEOF_WCHAR_T == 4
2588 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2589 #endif
2590
2591 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2592
2593 CFRelease(theString);
2594
2595 szUniCharBuffer[nOutLength] = '\0' ;
2596
2597 #if SIZEOF_WCHAR_T == 4
2598 wxMBConvUTF16 converter ;
2599 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2600 delete[] szUniCharBuffer;
2601 #endif
2602
2603 return nOutLength;
2604 }
2605
2606 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2607 {
2608 wxASSERT(szUnConv);
2609
2610 size_t nRealOutSize;
2611 size_t nBufSize = wxWcslen(szUnConv);
2612 UniChar* szUniBuffer = (UniChar*) szUnConv;
2613
2614 #if SIZEOF_WCHAR_T == 4
2615 wxMBConvUTF16 converter ;
2616 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2617 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2618 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2619 nBufSize /= sizeof(UniChar);
2620 #endif
2621
2622 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2623 NULL, //allocator
2624 szUniBuffer,
2625 nBufSize,
2626 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2627 );
2628
2629 wxASSERT(theString);
2630
2631 //Note that CER puts a BOM when converting to unicode
2632 //so we check and use getchars instead in that case
2633 if (m_encoding == kCFStringEncodingUnicode)
2634 {
2635 if (szOut != NULL)
2636 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2637
2638 nRealOutSize = CFStringGetLength(theString) + 1;
2639 }
2640 else
2641 {
2642 CFStringGetBytes(
2643 theString,
2644 CFRangeMake(0, CFStringGetLength(theString)),
2645 m_encoding,
2646 0, //what to put in characters that can't be converted -
2647 //0 tells CFString to return NULL if it meets such a character
2648 false, //not an external representation
2649 (UInt8*) szOut,
2650 nOutSize,
2651 (CFIndex*) &nRealOutSize
2652 );
2653 }
2654
2655 CFRelease(theString);
2656
2657 #if SIZEOF_WCHAR_T == 4
2658 delete[] szUniBuffer;
2659 #endif
2660
2661 return nRealOutSize - 1;
2662 }
2663
2664 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2665
2666 bool IsOk() const
2667 {
2668 return m_encoding != kCFStringEncodingInvalidId &&
2669 CFStringIsEncodingAvailable(m_encoding);
2670 }
2671
2672 private:
2673 CFStringEncoding m_encoding ;
2674 };
2675
2676 #endif // defined(__WXCOCOA__)
2677
2678 // ============================================================================
2679 // Mac conversion classes
2680 // ============================================================================
2681
2682 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2683
2684 class wxMBConv_mac : public wxMBConv
2685 {
2686 public:
2687 wxMBConv_mac()
2688 {
2689 Init(CFStringGetSystemEncoding()) ;
2690 }
2691
2692 wxMBConv_mac(const wxMBConv_mac& conv)
2693 {
2694 Init(conv.m_char_encoding);
2695 }
2696
2697 #if wxUSE_FONTMAP
2698 wxMBConv_mac(const wxChar* name)
2699 {
2700 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2701 }
2702 #endif
2703
2704 wxMBConv_mac(wxFontEncoding encoding)
2705 {
2706 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2707 }
2708
2709 ~wxMBConv_mac()
2710 {
2711 OSStatus status = noErr ;
2712 status = TECDisposeConverter(m_MB2WC_converter);
2713 status = TECDisposeConverter(m_WC2MB_converter);
2714 }
2715
2716
2717 void Init( TextEncodingBase encoding)
2718 {
2719 OSStatus status = noErr ;
2720 m_char_encoding = encoding ;
2721 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2722
2723 status = TECCreateConverter(&m_MB2WC_converter,
2724 m_char_encoding,
2725 m_unicode_encoding);
2726 status = TECCreateConverter(&m_WC2MB_converter,
2727 m_unicode_encoding,
2728 m_char_encoding);
2729 }
2730
2731 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2732 {
2733 OSStatus status = noErr ;
2734 ByteCount byteOutLen ;
2735 ByteCount byteInLen = strlen(psz) ;
2736 wchar_t *tbuf = NULL ;
2737 UniChar* ubuf = NULL ;
2738 size_t res = 0 ;
2739
2740 if (buf == NULL)
2741 {
2742 //apple specs say at least 32
2743 n = wxMax( 32 , byteInLen ) ;
2744 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2745 }
2746 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2747 #if SIZEOF_WCHAR_T == 4
2748 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2749 #else
2750 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2751 #endif
2752 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2753 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2754 #if SIZEOF_WCHAR_T == 4
2755 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2756 // is not properly terminated we get random characters at the end
2757 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2758 wxMBConvUTF16 converter ;
2759 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2760 free( ubuf ) ;
2761 #else
2762 res = byteOutLen / sizeof( UniChar ) ;
2763 #endif
2764 if ( buf == NULL )
2765 free(tbuf) ;
2766
2767 if ( buf && res < n)
2768 buf[res] = 0;
2769
2770 return res ;
2771 }
2772
2773 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2774 {
2775 OSStatus status = noErr ;
2776 ByteCount byteOutLen ;
2777 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2778
2779 char *tbuf = NULL ;
2780
2781 if (buf == NULL)
2782 {
2783 //apple specs say at least 32
2784 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2785 tbuf = (char*) malloc( n ) ;
2786 }
2787
2788 ByteCount byteBufferLen = n ;
2789 UniChar* ubuf = NULL ;
2790 #if SIZEOF_WCHAR_T == 4
2791 wxMBConvUTF16 converter ;
2792 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2793 byteInLen = unicharlen ;
2794 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2795 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2796 #else
2797 ubuf = (UniChar*) psz ;
2798 #endif
2799 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2800 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2801 #if SIZEOF_WCHAR_T == 4
2802 free( ubuf ) ;
2803 #endif
2804 if ( buf == NULL )
2805 free(tbuf) ;
2806
2807 size_t res = byteOutLen ;
2808 if ( buf && res < n)
2809 {
2810 buf[res] = 0;
2811
2812 //we need to double-trip to verify it didn't insert any ? in place
2813 //of bogus characters
2814 wxWCharBuffer wcBuf(n);
2815 size_t pszlen = wxWcslen(psz);
2816 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2817 wxWcslen(wcBuf) != pszlen ||
2818 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2819 {
2820 // we didn't obtain the same thing we started from, hence
2821 // the conversion was lossy and we consider that it failed
2822 return wxCONV_FAILED;
2823 }
2824 }
2825
2826 return res ;
2827 }
2828
2829 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2830
2831 bool IsOk() const
2832 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2833
2834 private:
2835 TECObjectRef m_MB2WC_converter ;
2836 TECObjectRef m_WC2MB_converter ;
2837
2838 TextEncodingBase m_char_encoding ;
2839 TextEncodingBase m_unicode_encoding ;
2840 };
2841
2842 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2843
2844 // ============================================================================
2845 // wxEncodingConverter based conversion classes
2846 // ============================================================================
2847
2848 #if wxUSE_FONTMAP
2849
2850 class wxMBConv_wxwin : public wxMBConv
2851 {
2852 private:
2853 void Init()
2854 {
2855 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2856 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2857 }
2858
2859 public:
2860 // temporarily just use wxEncodingConverter stuff,
2861 // so that it works while a better implementation is built
2862 wxMBConv_wxwin(const wxChar* name)
2863 {
2864 if (name)
2865 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2866 else
2867 m_enc = wxFONTENCODING_SYSTEM;
2868
2869 Init();
2870 }
2871
2872 wxMBConv_wxwin(wxFontEncoding enc)
2873 {
2874 m_enc = enc;
2875
2876 Init();
2877 }
2878
2879 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2880 {
2881 size_t inbuf = strlen(psz);
2882 if (buf)
2883 {
2884 if (!m2w.Convert(psz,buf))
2885 return wxCONV_FAILED;
2886 }
2887 return inbuf;
2888 }
2889
2890 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2891 {
2892 const size_t inbuf = wxWcslen(psz);
2893 if (buf)
2894 {
2895 if (!w2m.Convert(psz,buf))
2896 return wxCONV_FAILED;
2897 }
2898
2899 return inbuf;
2900 }
2901
2902 virtual size_t GetMBNulLen() const
2903 {
2904 switch ( m_enc )
2905 {
2906 case wxFONTENCODING_UTF16BE:
2907 case wxFONTENCODING_UTF16LE:
2908 return 2;
2909
2910 case wxFONTENCODING_UTF32BE:
2911 case wxFONTENCODING_UTF32LE:
2912 return 4;
2913
2914 default:
2915 return 1;
2916 }
2917 }
2918
2919 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2920
2921 bool IsOk() const { return m_ok; }
2922
2923 public:
2924 wxFontEncoding m_enc;
2925 wxEncodingConverter m2w, w2m;
2926
2927 private:
2928 // were we initialized successfully?
2929 bool m_ok;
2930
2931 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2932 };
2933
2934 // make the constructors available for unit testing
2935 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2936 {
2937 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2938 if ( !result->IsOk() )
2939 {
2940 delete result;
2941 return 0;
2942 }
2943 return result;
2944 }
2945
2946 #endif // wxUSE_FONTMAP
2947
2948 // ============================================================================
2949 // wxCSConv implementation
2950 // ============================================================================
2951
2952 void wxCSConv::Init()
2953 {
2954 m_name = NULL;
2955 m_convReal = NULL;
2956 m_deferred = true;
2957 }
2958
2959 wxCSConv::wxCSConv(const wxChar *charset)
2960 {
2961 Init();
2962
2963 if ( charset )
2964 {
2965 SetName(charset);
2966 }
2967
2968 #if wxUSE_FONTMAP
2969 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2970 #else
2971 m_encoding = wxFONTENCODING_SYSTEM;
2972 #endif
2973 }
2974
2975 wxCSConv::wxCSConv(wxFontEncoding encoding)
2976 {
2977 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2978 {
2979 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2980
2981 encoding = wxFONTENCODING_SYSTEM;
2982 }
2983
2984 Init();
2985
2986 m_encoding = encoding;
2987 }
2988
2989 wxCSConv::~wxCSConv()
2990 {
2991 Clear();
2992 }
2993
2994 wxCSConv::wxCSConv(const wxCSConv& conv)
2995 : wxMBConv()
2996 {
2997 Init();
2998
2999 SetName(conv.m_name);
3000 m_encoding = conv.m_encoding;
3001 }
3002
3003 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3004 {
3005 Clear();
3006
3007 SetName(conv.m_name);
3008 m_encoding = conv.m_encoding;
3009
3010 return *this;
3011 }
3012
3013 void wxCSConv::Clear()
3014 {
3015 free(m_name);
3016 delete m_convReal;
3017
3018 m_name = NULL;
3019 m_convReal = NULL;
3020 }
3021
3022 void wxCSConv::SetName(const wxChar *charset)
3023 {
3024 if (charset)
3025 {
3026 m_name = wxStrdup(charset);
3027 m_deferred = true;
3028 }
3029 }
3030
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map associating a charset name (wxString) with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): for each encoding it remembers the name
// with which a wxMBConv_iconv could be created, an empty string is stored if
// none of the names of the encoding worked
static wxEncodingNameCache gs_nameCache;
#endif
3039
3040 wxMBConv *wxCSConv::DoCreate() const
3041 {
3042 #if wxUSE_FONTMAP
3043 wxLogTrace(TRACE_STRCONV,
3044 wxT("creating conversion for %s"),
3045 (m_name ? m_name
3046 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3047 #endif // wxUSE_FONTMAP
3048
3049 // check for the special case of ASCII or ISO8859-1 charset: as we have
3050 // special knowledge of it anyhow, we don't need to create a special
3051 // conversion object
3052 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3053 m_encoding == wxFONTENCODING_DEFAULT )
3054 {
3055 // don't convert at all
3056 return NULL;
3057 }
3058
3059 // we trust OS to do conversion better than we can so try external
3060 // conversion methods first
3061 //
3062 // the full order is:
3063 // 1. OS conversion (iconv() under Unix or Win32 API)
3064 // 2. hard coded conversions for UTF
3065 // 3. wxEncodingConverter as fall back
3066
3067 // step (1)
3068 #ifdef HAVE_ICONV
3069 #if !wxUSE_FONTMAP
3070 if ( m_name )
3071 #endif // !wxUSE_FONTMAP
3072 {
3073 wxString name(m_name);
3074 wxFontEncoding encoding(m_encoding);
3075
3076 if ( !name.empty() )
3077 {
3078 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3079 if ( conv->IsOk() )
3080 return conv;
3081
3082 delete conv;
3083
3084 #if wxUSE_FONTMAP
3085 encoding =
3086 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3087 #endif // wxUSE_FONTMAP
3088 }
3089 #if wxUSE_FONTMAP
3090 {
3091 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3092 if ( it != gs_nameCache.end() )
3093 {
3094 if ( it->second.empty() )
3095 return NULL;
3096
3097 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3098 if ( conv->IsOk() )
3099 return conv;
3100
3101 delete conv;
3102 }
3103
3104 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3105
3106 for ( ; *names; ++names )
3107 {
3108 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3109 if ( conv->IsOk() )
3110 {
3111 gs_nameCache[encoding] = *names;
3112 return conv;
3113 }
3114
3115 delete conv;
3116 }
3117
3118 gs_nameCache[encoding] = _T(""); // cache the failure
3119 }
3120 #endif // wxUSE_FONTMAP
3121 }
3122 #endif // HAVE_ICONV
3123
3124 #ifdef wxHAVE_WIN32_MB2WC
3125 {
3126 #if wxUSE_FONTMAP
3127 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3128 : new wxMBConv_win32(m_encoding);
3129 if ( conv->IsOk() )
3130 return conv;
3131
3132 delete conv;
3133 #else
3134 return NULL;
3135 #endif
3136 }
3137 #endif // wxHAVE_WIN32_MB2WC
3138 #if defined(__WXMAC__)
3139 {
3140 // leave UTF16 and UTF32 to the built-ins of wx
3141 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3142 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3143 {
3144
3145 #if wxUSE_FONTMAP
3146 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3147 : new wxMBConv_mac(m_encoding);
3148 #else
3149 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3150 #endif
3151 if ( conv->IsOk() )
3152 return conv;
3153
3154 delete conv;
3155 }
3156 }
3157 #endif
3158 #if defined(__WXCOCOA__)
3159 {
3160 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3161 {
3162
3163 #if wxUSE_FONTMAP
3164 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3165 : new wxMBConv_cocoa(m_encoding);
3166 #else
3167 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3168 #endif
3169 if ( conv->IsOk() )
3170 return conv;
3171
3172 delete conv;
3173 }
3174 }
3175 #endif
3176 // step (2)
3177 wxFontEncoding enc = m_encoding;
3178 #if wxUSE_FONTMAP
3179 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3180 {
3181 // use "false" to suppress interactive dialogs -- we can be called from
3182 // anywhere and popping up a dialog from here is the last thing we want to
3183 // do
3184 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3185 }
3186 #endif // wxUSE_FONTMAP
3187
3188 switch ( enc )
3189 {
3190 case wxFONTENCODING_UTF7:
3191 return new wxMBConvUTF7;
3192
3193 case wxFONTENCODING_UTF8:
3194 return new wxMBConvUTF8;
3195
3196 case wxFONTENCODING_UTF16BE:
3197 return new wxMBConvUTF16BE;
3198
3199 case wxFONTENCODING_UTF16LE:
3200 return new wxMBConvUTF16LE;
3201
3202 case wxFONTENCODING_UTF32BE:
3203 return new wxMBConvUTF32BE;
3204
3205 case wxFONTENCODING_UTF32LE:
3206 return new wxMBConvUTF32LE;
3207
3208 default:
3209 // nothing to do but put here to suppress gcc warnings
3210 ;
3211 }
3212
3213 // step (3)
3214 #if wxUSE_FONTMAP
3215 {
3216 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3217 : new wxMBConv_wxwin(m_encoding);
3218 if ( conv->IsOk() )
3219 return conv;
3220
3221 delete conv;
3222 }
3223 #endif // wxUSE_FONTMAP
3224
3225 // NB: This is a hack to prevent deadlock. What could otherwise happen
3226 // in Unicode build: wxConvLocal creation ends up being here
3227 // because of some failure and logs the error. But wxLog will try to
3228 // attach timestamp, for which it will need wxConvLocal (to convert
3229 // time to char* and then wchar_t*), but that fails, tries to log
3230 // error, but wxLog has a (already locked) critical section that
3231 // guards static buffer.
3232 static bool alreadyLoggingError = false;
3233 if (!alreadyLoggingError)
3234 {
3235 alreadyLoggingError = true;
3236 wxLogError(_("Cannot convert from the charset '%s'!"),
3237 m_name ? m_name
3238 :
3239 #if wxUSE_FONTMAP
3240 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3241 #else // !wxUSE_FONTMAP
3242 wxString::Format(_("encoding %s"), m_encoding).c_str()
3243 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3244 );
3245 alreadyLoggingError = false;
3246 }
3247
3248 return NULL;
3249 }
3250
3251 void wxCSConv::CreateConvIfNeeded() const
3252 {
3253 if ( m_deferred )
3254 {
3255 wxCSConv *self = (wxCSConv *)this; // const_cast
3256
3257 #if wxUSE_INTL
3258 // if we don't have neither the name nor the encoding, use the default
3259 // encoding for this system
3260 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3261 {
3262 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3263 }
3264 #endif // wxUSE_INTL
3265
3266 self->m_convReal = DoCreate();
3267 self->m_deferred = false;
3268 }
3269 }
3270
3271 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3272 {
3273 CreateConvIfNeeded();
3274
3275 if (m_convReal)
3276 return m_convReal->MB2WC(buf, psz, n);
3277
3278 // latin-1 (direct)
3279 size_t len = strlen(psz);
3280
3281 if (buf)
3282 {
3283 for (size_t c = 0; c <= len; c++)
3284 buf[c] = (unsigned char)(psz[c]);
3285 }
3286
3287 return len;
3288 }
3289
3290 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3291 {
3292 CreateConvIfNeeded();
3293
3294 if (m_convReal)
3295 return m_convReal->WC2MB(buf, psz, n);
3296
3297 // latin-1 (direct)
3298 const size_t len = wxWcslen(psz);
3299 if (buf)
3300 {
3301 for (size_t c = 0; c <= len; c++)
3302 {
3303 if (psz[c] > 0xFF)
3304 return wxCONV_FAILED;
3305 buf[c] = (char)psz[c];
3306 }
3307 }
3308 else
3309 {
3310 for (size_t c = 0; c <= len; c++)
3311 {
3312 if (psz[c] > 0xFF)
3313 return wxCONV_FAILED;
3314 }
3315 }
3316
3317 return len;
3318 }
3319
3320 size_t wxCSConv::GetMBNulLen() const
3321 {
3322 CreateConvIfNeeded();
3323
3324 if ( m_convReal )
3325 {
3326 return m_convReal->GetMBNulLen();
3327 }
3328
3329 return 1;
3330 }
3331
3332 // ----------------------------------------------------------------------------
3333 // globals
3334 // ----------------------------------------------------------------------------
3335
// the object behind wxConvLibc: its class depends on the platform
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other predefined converters
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;

// the exported converters are references (or pointers) to the static objects
// defined above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
// wxConvFileName points to the UTF-8 converter under __WXOSX__ and to the
// libc one elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
                                    wxConvUTF8Obj;
#else
                                    wxConvLibcObj;
#endif
3361
3362
3363 #else // !wxUSE_WCHAR_T
3364
// stand-ins in absence of wchar_t: the converters are defined as plain
// wxMBConv objects so that the names still exist in this configuration
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3370
3371 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T