]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Move wxMBConv_cf out of strconv.cpp and into strconv_cf.(cpp|h) by forking the files...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef __SALFORDC__
48 #include <clib.h>
49 #endif
50
51 #ifdef HAVE_ICONV
52 #include <iconv.h>
53 #include "wx/thread.h"
54 #endif
55
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
58
59 #ifdef __DARWIN__
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
62
63 #ifdef __WXMAC__
64 #ifndef __DARWIN__
65 #include <ATSUnicode.h>
66 #include <TextCommon.h>
67 #include <TextEncodingConverter.h>
68 #endif
69
70 // includes Mac headers
71 #include "wx/mac/private.h"
72 #endif
73
74
75 #define TRACE_STRCONV _T("strconv")
76
77 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
78 // be 4 bytes
79 #if SIZEOF_WCHAR_T == 2
80 #define WC_UTF16
81 #endif
82
83
84 // ============================================================================
85 // implementation
86 // ============================================================================
87
// Helper of the conversion functions below: returns true if at least one of
// the n bytes starting at p is not NUL and false if all of them are NUL (which
// is trivially the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
96
97 // ----------------------------------------------------------------------------
98 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
99 // ----------------------------------------------------------------------------
100
101 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
102 {
103 if (input <= 0xffff)
104 {
105 if (output)
106 *output = (wxUint16) input;
107
108 return 1;
109 }
110 else if (input >= 0x110000)
111 {
112 return wxCONV_FAILED;
113 }
114 else
115 {
116 if (output)
117 {
118 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
119 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
120 }
121
122 return 2;
123 }
124 }
125
126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
127 {
128 if ((*input < 0xd800) || (*input > 0xdfff))
129 {
130 output = *input;
131 return 1;
132 }
133 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
134 {
135 output = *input;
136 return wxCONV_FAILED;
137 }
138 else
139 {
140 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
141 return 2;
142 }
143 }
144
145 #ifdef WC_UTF16
146 typedef wchar_t wxDecodeSurrogate_t;
147 #else // !WC_UTF16
148 typedef wxUint16 wxDecodeSurrogate_t;
149 #endif // WC_UTF16/!WC_UTF16
150
151 // returns the next UTF-32 character from the wchar_t buffer and advances the
152 // pointer to the character after this one
153 //
154 // if an invalid character is found, *pSrc is set to NULL, the caller must
155 // check for this
156 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
157 {
158 wxUint32 out;
159 const size_t
160 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
161 if ( n == wxCONV_FAILED )
162 *pSrc = NULL;
163 else
164 *pSrc += n;
165
166 return out;
167 }
168
169 // ----------------------------------------------------------------------------
170 // wxMBConv
171 // ----------------------------------------------------------------------------
172
// Default implementation of ToWChar() written in terms of MB2WC().
//
// Converts srcLen bytes of src (or, if srcLen is wxNO_LEN, a single
// NUL-terminated string) one NUL-terminated chunk at a time. Returns the
// number of wide characters, counting the L'\0' following each chunk, which
// were written to dst or would be written to it if dst is NULL. Returns
// wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or if dst is not NULL and
// more than dstLen characters would be needed.
173 size_t
174 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
175 const char *src, size_t srcLen) const
176 {
177 // although new conversion classes are supposed to implement this function
178 // directly, the existing ones only implement the old MB2WC() and so, to
179 // avoid to have to rewrite all conversion classes at once, we provide a
180 // default (but not efficient) implementation of this one in terms of the
181 // old function by copying the input to ensure that it's NUL-terminated and
182 // then using MB2WC() to convert it
183
184 // the number of chars [which would be] written to dst [if it were not NULL]
185 size_t dstWritten = 0;
186
187 // the number of NULs terminating this string
188 size_t nulLen = 0; // not really needed, but just to avoid warnings
189
190 // if we were not given the input size we just have to assume that the
191 // string is properly terminated as we have no way of knowing how long it
192 // is anyhow, but if we do have the size check whether there are enough
193 // NULs at the end
194 wxCharBuffer bufTmp;
195 const char *srcEnd;
196 if ( srcLen != wxNO_LEN )
197 {
198 // we need to know how to find the end of this string
199 nulLen = GetMBNulLen();
200 if ( nulLen == wxCONV_FAILED )
201 return wxCONV_FAILED;
202
203 // if there are enough NULs we can avoid the copy
204 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
205 {
206 // make a copy in order to properly NUL-terminate the string
207 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
208 char * const p = bufTmp.data();
209 memcpy(p, src, srcLen);
210 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
211 *s = '\0';
212
213 src = bufTmp;
214 }
215
216 srcEnd = src + srcLen;
217 }
218 else // quit after the first loop iteration
219 {
220 srcEnd = NULL;
221 }
222
223 for ( ;; )
224 {
225 // try to convert the current chunk
226 size_t lenChunk = MB2WC(NULL, src, 0);
227 if ( lenChunk == wxCONV_FAILED )
228 return wxCONV_FAILED;
229
230 lenChunk++; // for the L'\0' at the end of this chunk
231
232 dstWritten += lenChunk;
233
234 if ( lenChunk == 1 )
235 {
236 // nothing left in the input string, conversion succeeded
237 break;
238 }
239
240 if ( dst )
241 {
242 if ( dstWritten > dstLen )
243 return wxCONV_FAILED;
244
245 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
246 return wxCONV_FAILED;
247
248 dst += lenChunk;
249 }
250
251 if ( !srcEnd )
252 {
253 // we convert just one chunk in this case as this is the entire
254 // string anyhow
255 break;
256 }
257
258 // advance the input pointer past the end of this chunk
259 while ( NotAllNULs(src, nulLen) )
260 {
261 // notice that we must skip over multiple bytes here as we suppose
262 // that if NUL takes 2 or 4 bytes, then all the other characters do
263 // too and so if advanced by a single byte we might erroneously
264 // detect sequences of NUL bytes in the middle of the input
265 src += nulLen;
266 }
267
268 src += nulLen; // skipping over its terminator as well
269
270 // note that ">=" (and not just "==") is needed here as the terminator
271 // we skipped just above could be inside or just after the buffer
272 // delimited by srcEnd
273 if ( src >= srcEnd )
274 break;
275 }
276
277 return dstWritten;
278 }
279
280 size_t
281 wxMBConv::FromWChar(char *dst, size_t dstLen,
282 const wchar_t *src, size_t srcLen) const
283 {
284 // the number of chars [which would be] written to dst [if it were not NULL]
285 size_t dstWritten = 0;
286
287 // make a copy of the input string unless it is already properly
288 // NUL-terminated
289 //
290 // if we don't know its length we have no choice but to assume that it is,
291 // indeed, properly terminated
292 wxWCharBuffer bufTmp;
293 if ( srcLen == wxNO_LEN )
294 {
295 srcLen = wxWcslen(src) + 1;
296 }
297 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
298 {
299 // make a copy in order to properly NUL-terminate the string
300 bufTmp = wxWCharBuffer(srcLen);
301 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
302 src = bufTmp;
303 }
304
305 const size_t lenNul = GetMBNulLen();
306 for ( const wchar_t * const srcEnd = src + srcLen;
307 src < srcEnd;
308 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
309 {
310 // try to convert the current chunk
311 size_t lenChunk = WC2MB(NULL, src, 0);
312
313 if ( lenChunk == wxCONV_FAILED )
314 return wxCONV_FAILED;
315
316 lenChunk += lenNul;
317 dstWritten += lenChunk;
318
319 if ( dst )
320 {
321 if ( dstWritten > dstLen )
322 return wxCONV_FAILED;
323
324 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
325 return wxCONV_FAILED;
326
327 dst += lenChunk;
328 }
329 }
330
331 return dstWritten;
332 }
333
334 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
335 {
336 size_t rc = ToWChar(outBuff, outLen, inBuff);
337 if ( rc != wxCONV_FAILED )
338 {
339 // ToWChar() returns the buffer length, i.e. including the trailing
340 // NUL, while this method doesn't take it into account
341 rc--;
342 }
343
344 return rc;
345 }
346
347 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
348 {
349 size_t rc = FromWChar(outBuff, outLen, inBuff);
350 if ( rc != wxCONV_FAILED )
351 {
352 rc -= GetMBNulLen();
353 }
354
355 return rc;
356 }
357
358 wxMBConv::~wxMBConv()
359 {
360 // nothing to do here (necessary for Darwin linking probably)
361 }
362
363 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
364 {
365 if ( psz )
366 {
367 // calculate the length of the buffer needed first
368 const size_t nLen = MB2WC(NULL, psz, 0);
369 if ( nLen != wxCONV_FAILED )
370 {
371 // now do the actual conversion
372 wxWCharBuffer buf(nLen /* +1 added implicitly */);
373
374 // +1 for the trailing NULL
375 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
376 return buf;
377 }
378 }
379
380 return wxWCharBuffer();
381 }
382
383 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
384 {
385 if ( pwz )
386 {
387 const size_t nLen = WC2MB(NULL, pwz, 0);
388 if ( nLen != wxCONV_FAILED )
389 {
390 // extra space for trailing NUL(s)
391 static const size_t extraLen = GetMaxMBNulLen();
392
393 wxCharBuffer buf(nLen + extraLen - 1);
394 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
395 return buf;
396 }
397 }
398
399 return wxCharBuffer();
400 }
401
402 const wxWCharBuffer
403 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
404 {
405 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
406 if ( dstLen != wxCONV_FAILED )
407 {
408 wxWCharBuffer wbuf(dstLen - 1);
409 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
410 {
411 if ( outLen )
412 {
413 *outLen = dstLen;
414 if ( wbuf[dstLen - 1] == L'\0' )
415 (*outLen)--;
416 }
417
418 return wbuf;
419 }
420 }
421
422 if ( outLen )
423 *outLen = 0;
424
425 return wxWCharBuffer();
426 }
427
428 const wxCharBuffer
429 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
430 {
431 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
432 if ( dstLen != wxCONV_FAILED )
433 {
434 // special case of empty input: can't allocate 0 size buffer below as
435 // wxCharBuffer insists on NUL-terminating it
436 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
437 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
438 {
439 if ( outLen )
440 {
441 *outLen = dstLen;
442
443 const size_t nulLen = GetMBNulLen();
444 if ( dstLen >= nulLen &&
445 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
446 {
447 // in this case the output is NUL-terminated and we're not
448 // supposed to count NUL
449 *outLen -= nulLen;
450 }
451 }
452
453 return buf;
454 }
455 }
456
457 if ( outLen )
458 *outLen = 0;
459
460 return wxCharBuffer();
461 }
462
463 // ----------------------------------------------------------------------------
464 // wxMBConvLibc
465 // ----------------------------------------------------------------------------
466
467 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
468 {
469 return wxMB2WC(buf, psz, n);
470 }
471
472 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
473 {
474 return wxWC2MB(buf, psz, n);
475 }
476
477 // ----------------------------------------------------------------------------
478 // wxConvBrokenFileNames
479 // ----------------------------------------------------------------------------
480
481 #ifdef __UNIX__
482
483 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
484 {
485 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
486 wxStricmp(charset, _T("UTF8")) == 0 )
487 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
488 else
489 m_conv = new wxCSConv(charset);
490 }
491
492 #endif // __UNIX__
493
494 // ----------------------------------------------------------------------------
495 // UTF-7
496 // ----------------------------------------------------------------------------
497
498 // Implementation (C) 2004 Fredrik Roubert
499
500 //
501 // BASE64 decoding table
502 //
// The table is indexed by the byte value and has 256 entries: 'A'-'Z' map to
// 0x00-0x19, 'a'-'z' to 0x1a-0x33, '0'-'9' to 0x34-0x3d, '+' to 0x3e and '/'
// to 0x3f. All the other entries are 0xff, which wxMBConvUTF7::MB2WC() treats
// as the end of the BASE64 run.
503 static const unsigned char utf7unb64[] =
504 {
505 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
506 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
508 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
511 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
512 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
514 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
515 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
516 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
518 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
519 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
520 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
532 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
533 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
534 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
535 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
536 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
537 };
538
// Decodes the NUL-terminated UTF-7 string psz.
//
// Returns the number of wide characters produced, not counting the trailing
// NUL, or wxCONV_FAILED if a '+' is followed neither by '-' nor by enough
// BASE64 characters to decode at least one byte. If buf is NULL nothing is
// stored and only the length is computed, otherwise decoding stops once n
// characters have been counted.
//
// NOTE(review): len is compared with n only before each directly encoded
// character or BASE64 run, the inner BASE64 loop stores to buf without
// checking it again -- confirm that callers always size buf accordingly.
539 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
540 {
541 size_t len = 0;
542
543 while ( *psz && (!buf || (len < n)) )
544 {
545 unsigned char cc = *psz++;
546 if (cc != '+')
547 {
548 // plain ASCII char
549 if (buf)
550 *buf++ = cc;
551 len++;
552 }
553 else if (*psz == '-')
554 {
555 // encoded plus sign
556 if (buf)
557 *buf++ = cc;
558 len++;
559 psz++;
560 }
561 else // start of BASE64 encoded string
562 {
// d accumulates the decoded bits and l is the number of bits in it which
// were not consumed yet; lsb tells whether the next byte extracted from d
// is the low (true) or the high (false) byte of the current character and
// ok becomes true as soon as at least one byte was decoded
563 bool lsb, ok;
564 unsigned int d, l;
565 for ( ok = lsb = false, d = 0, l = 0;
566 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
567 psz++ )
568 {
569 d <<= 6;
570 d += cc;
571 for (l += 6; l >= 8; lsb = !lsb)
572 {
573 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
574 if (lsb)
575 {
// low byte: completes the character started by the high byte
576 if (buf)
577 *buf++ |= c;
578 len ++;
579 }
580 else
581 {
// high byte: starts a new character but doesn't advance buf yet
582 if (buf)
583 *buf = (wchar_t)(c << 8);
584 }
585
586 ok = true;
587 }
588 }
589
590 if ( !ok )
591 {
592 // in valid UTF7 we should have valid characters after '+'
593 return wxCONV_FAILED;
594 }
595
// skip the optional '-' terminating the BASE64 run
596 if (*psz == '-')
597 psz++;
598 }
599 }
600
// NUL-terminate the output only if there is still room for it
601 if ( buf && (len < n) )
602 *buf = '\0';
603
604 return len;
605 }
606
607 //
608 // BASE64 encoding table
609 //
// Maps each 6 bit value (0-63) to the character representing it: 'A'-'Z',
// 'a'-'z', '0'-'9', '+' and '/', in this order.
610 static const unsigned char utf7enb64[] =
611 {
612 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
613 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
614 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
615 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
616 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
617 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
618 'w', 'x', 'y', 'z', '0', '1', '2', '3',
619 '4', '5', '6', '7', '8', '9', '+', '/'
620 };
621
622 //
623 // UTF-7 encoding table
624 //
625 // 0 - Set D (directly encoded characters)
626 // 1 - Set O (optional direct characters)
627 // 2 - whitespace characters (optional)
628 // 3 - special characters
629 //
// The table is indexed by the character code (0-127, 16 entries per row).
// wxMBConvUTF7::WC2MB() copies only the characters of class 0 unchanged and
// BASE64-encodes the characters of all the other classes.
630 static const unsigned char utf7encode[128] =
631 {
632 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
633 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
634 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
635 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
636 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
637 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
638 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
639 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
640 };
641
// Encodes the NUL-terminated wide string psz as UTF-7.
//
// Characters below 0x80 whose utf7encode[] class is 0 are copied unchanged,
// '+' is output as "+-" and any other character starts a run which is output
// as '+' followed by the BASE64 encoding of the 16 bit character values and
// '-'; the run extends until the next character of class 0 or the end of the
// string. Returns the number of bytes produced, not counting the trailing
// NUL. If buf is NULL nothing is stored and only the length is computed,
// otherwise encoding stops once n bytes have been counted.
//
// NOTE(review): len is compared with n only before each directly encoded
// character or BASE64 run, the code inside a run stores to buf without
// checking it again -- confirm that callers always size buf accordingly.
642 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
643 {
644 size_t len = 0;
645
646 while (*psz && ((!buf) || (len < n)))
647 {
648 wchar_t cc = *psz++;
649 if (cc < 0x80 && utf7encode[cc] < 1)
650 {
651 // plain ASCII char
652 if (buf)
653 *buf++ = (char)cc;
654
655 len++;
656 }
657 #ifndef WC_UTF16
658 else if (((wxUint32)cc) > 0xffff)
659 {
660 // no surrogate pair generation (yet?)
661 return wxCONV_FAILED;
662 }
663 #endif
664 else
665 {
666 if (buf)
667 *buf++ = '+';
668
669 len++;
670 if (cc != '+')
671 {
672 // BASE64 encode string
// d accumulates the bits to output and l is the number of bits in
// it which were not output yet
673 unsigned int lsb, d, l;
674 for (d = 0, l = 0; /*nothing*/; psz++)
675 {
// feed the high byte of the character first, then the low one
676 for (lsb = 0; lsb < 2; lsb ++)
677 {
678 d <<= 8;
679 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
680
681 for (l += 8; l >= 6; )
682 {
683 l -= 6;
684 if (buf)
685 *buf++ = utf7enb64[(d >> l) % 64];
686 len++;
687 }
688 }
689
// stop at the end of the string or at the first character which
// can be output directly
690 cc = *psz;
691 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
692 break;
693 }
694
// output the remaining bits, if any, as one more BASE64 character
695 if (l != 0)
696 {
697 if (buf)
698 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
699
700 len++;
701 }
702 }
703
704 if (buf)
705 *buf++ = '-';
706 len++;
707 }
708 }
709
// NUL-terminate the output only if there is still room for it
710 if (buf && (len < n))
711 *buf = 0;
712
713 return len;
714 }
715
716 // ----------------------------------------------------------------------------
717 // UTF-8
718 // ----------------------------------------------------------------------------
719
// utf8_max[i] is the greatest value which is encoded using i + 1 bytes by
// wxMBConvUTF8::WC2MB(); the last element ensures that its search loop stops
// for any 32 bit value
720 static wxUint32 utf8_max[]=
721 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
722
723 // boundaries of the private use area we use to (temporarily) remap the
724 // bytes which are invalid in a UTF-8 encoded string
// (wxUnicodePUAEnd is exclusive: there is one code for each of the 256
// possible byte values)
725 const wxUint32 wxUnicodePUA = 0x100000;
726 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
727
// Decodes the NUL-terminated UTF-8 string psz.
//
// Returns the number of wide characters produced, not counting the trailing
// NUL. If buf is NULL nothing is stored and only the length is computed,
// otherwise decoding stops once n characters have been counted.
//
// The handling of the invalid input depends on m_options:
//  - with MAP_INVALID_UTF8_TO_PUA each byte of an invalid sequence is mapped
//    to wxUnicodePUA + the byte value
//  - with MAP_INVALID_UTF8_TO_OCTAL each such byte is represented by a
//    backslash followed by 3 octal digits (and a backslash in the input is
//    doubled in the output)
//  - otherwise wxCONV_FAILED is returned
728 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
729 {
730 size_t len = 0;
731
732 while (*psz && ((!buf) || (len < n)))
733 {
// remember the start of the sequence to be able to output its bytes
// one by one if it turns out to be invalid
734 const char *opsz = psz;
735 bool invalid = false;
736 unsigned char cc = *psz++, fc = cc;
// count the leading 1 bits of the first byte
737 unsigned cnt;
738 for (cnt = 0; fc & 0x80; cnt++)
739 fc <<= 1;
740
741 if (!cnt)
742 {
743 // plain ASCII char
744 if (buf)
745 *buf++ = cc;
746 len++;
747
748 // escape the escape character for octal escapes
749 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
750 && cc == '\\' && (!buf || len < n))
751 {
752 if (buf)
753 *buf++ = cc;
754 len++;
755 }
756 }
757 else
758 {
// cnt is now the number of continuation bytes expected
759 cnt--;
760 if (!cnt)
761 {
762 // invalid UTF-8 sequence
763 invalid = true;
764 }
765 else
766 {
767 unsigned ocnt = cnt - 1;
768 wxUint32 res = cc & (0x3f >> cnt);
769 while (cnt--)
770 {
771 cc = *psz;
772 if ((cc & 0xC0) != 0x80)
773 {
774 // invalid UTF-8 sequence
775 invalid = true;
776 break;
777 }
778
779 psz++;
780 res = (res << 6) | (cc & 0x3f);
781 }
782
// a value which would fit in a shorter sequence is not accepted
783 if (invalid || res <= utf8_max[ocnt])
784 {
785 // illegal UTF-8 encoding
786 invalid = true;
787 }
788 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
789 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
790 {
791 // if one of our PUA characters turns up externally
792 // it must also be treated as an illegal sequence
793 // (a bit like you have to escape an escape character)
794 invalid = true;
795 }
796 else
797 {
798 #ifdef WC_UTF16
799 // cast is ok because wchar_t == wxUint16 if WC_UTF16
800 size_t pa = encode_utf16(res, (wxUint16 *)buf);
801 if (pa == wxCONV_FAILED)
802 {
803 invalid = true;
804 }
805 else
806 {
807 if (buf)
808 buf += pa;
809 len += pa;
810 }
811 #else // !WC_UTF16
812 if (buf)
813 *buf++ = (wchar_t)res;
814 len++;
815 #endif // WC_UTF16/!WC_UTF16
816 }
817 }
818
819 if (invalid)
820 {
821 if (m_options & MAP_INVALID_UTF8_TO_PUA)
822 {
823 while (opsz < psz && (!buf || len < n))
824 {
825 #ifdef WC_UTF16
826 // cast is ok because wchar_t == wxUint16 if WC_UTF16
827 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
828 wxASSERT(pa != wxCONV_FAILED);
829 if (buf)
830 buf += pa;
831 opsz++;
832 len += pa;
833 #else
834 if (buf)
835 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
836 opsz++;
837 len++;
838 #endif
839 }
840 }
841 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
842 {
843 while (opsz < psz && (!buf || len < n))
844 {
// the escape is only stored if all 4 of its characters fit but it
// is always counted in len
845 if ( buf && len + 3 < n )
846 {
847 unsigned char on = *opsz;
848 *buf++ = L'\\';
849 *buf++ = (wchar_t)( L'0' + on / 0100 );
850 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
851 *buf++ = (wchar_t)( L'0' + on % 010 );
852 }
853
854 opsz++;
855 len += 4;
856 }
857 }
858 else // MAP_INVALID_UTF8_NOT
859 {
860 return wxCONV_FAILED;
861 }
862 }
863 }
864 }
865
// NUL-terminate the output only if there is still room for it
866 if (buf && (len < n))
867 *buf = 0;
868
869 return len;
870 }
871
// Returns true if wch is one of the octal digits, i.e. lies between L'0' and
// L'7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
876
// Encodes the NUL-terminated wide string psz as UTF-8.
//
// Returns the number of bytes produced, not counting the trailing NUL. If
// buf is NULL nothing is stored and only the length is computed, otherwise
// encoding stops once n bytes have been counted.
//
// This also undoes the mappings applied by MB2WC(), depending on m_options:
//  - with MAP_INVALID_UTF8_TO_PUA the characters in the range from
//    wxUnicodePUA to wxUnicodePUAEnd (exclusive) are output as single bytes
//  - with MAP_INVALID_UTF8_TO_OCTAL a double backslash is output as a single
//    one and a backslash followed by 3 octal digits as the byte with this
//    value
877 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
878 {
879 size_t len = 0;
880
881 while (*psz && ((!buf) || (len < n)))
882 {
883 wxUint32 cc;
884
885 #ifdef WC_UTF16
886 // cast is ok for WC_UTF16
887 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
// skip just one unit if it couldn't be decoded
888 psz += (pa == wxCONV_FAILED) ? 1 : pa;
889 #else
890 cc = (*psz++) & 0x7fffffff;
891 #endif
892
893 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
894 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
895 {
896 if (buf)
897 *buf++ = (char)(cc - wxUnicodePUA);
898 len++;
899 }
900 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
901 && cc == L'\\' && psz[0] == L'\\' )
902 {
903 if (buf)
904 *buf++ = (char)cc;
905 psz++;
906 len++;
907 }
908 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
909 cc == L'\\' &&
910 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
911 {
912 if (buf)
913 {
914 *buf++ = (char) ((psz[0] - L'0') * 0100 +
915 (psz[1] - L'0') * 010 +
916 (psz[2] - L'0'));
917 }
918
919 psz += 3;
920 len++;
921 }
922 else
923 {
// find the number of continuation bytes needed for this character
924 unsigned cnt;
925 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
926 {
927 }
928
929 if (!cnt)
930 {
931 // plain ASCII char
932 if (buf)
933 *buf++ = (char) cc;
934 len++;
935 }
936 else
937 {
938 len += cnt + 1;
939 if (buf)
940 {
// the lead byte has cnt + 1 high bits set followed by the most
// significant bits of the character, each of the continuation
// bytes has the form 10xxxxxx and carries 6 more bits
941 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
942 while (cnt--)
943 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
944 }
945 }
946 }
947 }
948
// NUL-terminate the output only if there is still room for it
949 if (buf && (len < n))
950 *buf = 0;
951
952 return len;
953 }
954
955 // ============================================================================
956 // UTF-16
957 // ============================================================================
958
959 #ifdef WORDS_BIGENDIAN
960 #define wxMBConvUTF16straight wxMBConvUTF16BE
961 #define wxMBConvUTF16swap wxMBConvUTF16LE
962 #else
963 #define wxMBConvUTF16swap wxMBConvUTF16BE
964 #define wxMBConvUTF16straight wxMBConvUTF16LE
965 #endif
966
967 /* static */
968 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
969 {
970 if ( srcLen == wxNO_LEN )
971 {
972 // count the number of bytes in input, including the trailing NULs
973 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
974 for ( srcLen = 1; *inBuff++; srcLen++ )
975 ;
976
977 srcLen *= BYTES_PER_CHAR;
978 }
979 else // we already have the length
980 {
981 // we can only convert an entire number of UTF-16 characters
982 if ( srcLen % BYTES_PER_CHAR )
983 return wxCONV_FAILED;
984 }
985
986 return srcLen;
987 }
988
989 // case when in-memory representation is UTF-16 too
990 #ifdef WC_UTF16
991
992 // ----------------------------------------------------------------------------
993 // conversions without endianness change
994 // ----------------------------------------------------------------------------
995
996 size_t
997 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
998 const char *src, size_t srcLen) const
999 {
1000 // set up the scene for using memcpy() (which is presumably more efficient
1001 // than copying the bytes one by one)
1002 srcLen = GetLength(src, srcLen);
1003 if ( srcLen == wxNO_LEN )
1004 return wxCONV_FAILED;
1005
1006 const size_t inLen = srcLen / BYTES_PER_CHAR;
1007 if ( dst )
1008 {
1009 if ( dstLen < inLen )
1010 return wxCONV_FAILED;
1011
1012 memcpy(dst, src, srcLen);
1013 }
1014
1015 return inLen;
1016 }
1017
1018 size_t
1019 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1020 const wchar_t *src, size_t srcLen) const
1021 {
1022 if ( srcLen == wxNO_LEN )
1023 srcLen = wxWcslen(src) + 1;
1024
1025 srcLen *= BYTES_PER_CHAR;
1026
1027 if ( dst )
1028 {
1029 if ( dstLen < srcLen )
1030 return wxCONV_FAILED;
1031
1032 memcpy(dst, src, srcLen);
1033 }
1034
1035 return srcLen;
1036 }
1037
1038 // ----------------------------------------------------------------------------
1039 // endian-reversing conversions
1040 // ----------------------------------------------------------------------------
1041
1042 size_t
1043 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1044 const char *src, size_t srcLen) const
1045 {
1046 srcLen = GetLength(src, srcLen);
1047 if ( srcLen == wxNO_LEN )
1048 return wxCONV_FAILED;
1049
1050 srcLen /= BYTES_PER_CHAR;
1051
1052 if ( dst )
1053 {
1054 if ( dstLen < srcLen )
1055 return wxCONV_FAILED;
1056
1057 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1058 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1059 {
1060 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1061 }
1062 }
1063
1064 return srcLen;
1065 }
1066
1067 size_t
1068 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1069 const wchar_t *src, size_t srcLen) const
1070 {
1071 if ( srcLen == wxNO_LEN )
1072 srcLen = wxWcslen(src) + 1;
1073
1074 srcLen *= BYTES_PER_CHAR;
1075
1076 if ( dst )
1077 {
1078 if ( dstLen < srcLen )
1079 return wxCONV_FAILED;
1080
1081 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1082 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1083 {
1084 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1085 }
1086 }
1087
1088 return srcLen;
1089 }
1090
1091 #else // !WC_UTF16: wchar_t is UTF-32
1092
1093 // ----------------------------------------------------------------------------
1094 // conversions without endianness change
1095 // ----------------------------------------------------------------------------
1096
1097 size_t
1098 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1099 const char *src, size_t srcLen) const
1100 {
1101 srcLen = GetLength(src, srcLen);
1102 if ( srcLen == wxNO_LEN )
1103 return wxCONV_FAILED;
1104
1105 const size_t inLen = srcLen / BYTES_PER_CHAR;
1106 if ( !dst )
1107 {
1108 // optimization: return maximal space which could be needed for this
1109 // string even if the real size could be smaller if the buffer contains
1110 // any surrogates
1111 return inLen;
1112 }
1113
1114 size_t outLen = 0;
1115 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1116 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1117 {
1118 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1119 if ( !inBuff )
1120 return wxCONV_FAILED;
1121
1122 if ( ++outLen > dstLen )
1123 return wxCONV_FAILED;
1124
1125 *dst++ = ch;
1126 }
1127
1128
1129 return outLen;
1130 }
1131
1132 size_t
1133 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1134 const wchar_t *src, size_t srcLen) const
1135 {
1136 if ( srcLen == wxNO_LEN )
1137 srcLen = wxWcslen(src) + 1;
1138
1139 size_t outLen = 0;
1140 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1141 for ( size_t n = 0; n < srcLen; n++ )
1142 {
1143 wxUint16 cc[2];
1144 const size_t numChars = encode_utf16(*src++, cc);
1145 if ( numChars == wxCONV_FAILED )
1146 return wxCONV_FAILED;
1147
1148 outLen += numChars * BYTES_PER_CHAR;
1149 if ( outBuff )
1150 {
1151 if ( outLen > dstLen )
1152 return wxCONV_FAILED;
1153
1154 *outBuff++ = cc[0];
1155 if ( numChars == 2 )
1156 {
1157 // second character of a surrogate
1158 *outBuff++ = cc[1];
1159 }
1160 }
1161 }
1162
1163 return outLen;
1164 }
1165
1166 // ----------------------------------------------------------------------------
1167 // endian-reversing conversions
1168 // ----------------------------------------------------------------------------
1169
1170 size_t
1171 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1172 const char *src, size_t srcLen) const
1173 {
1174 srcLen = GetLength(src, srcLen);
1175 if ( srcLen == wxNO_LEN )
1176 return wxCONV_FAILED;
1177
1178 const size_t inLen = srcLen / BYTES_PER_CHAR;
1179 if ( !dst )
1180 {
1181 // optimization: return maximal space which could be needed for this
1182 // string even if the real size could be smaller if the buffer contains
1183 // any surrogates
1184 return inLen;
1185 }
1186
1187 size_t outLen = 0;
1188 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1189 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1190 {
1191 wxUint32 ch;
1192 wxUint16 tmp[2];
1193
1194 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1195 inBuff++;
1196 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1197
1198 const size_t numChars = decode_utf16(tmp, ch);
1199 if ( numChars == wxCONV_FAILED )
1200 return wxCONV_FAILED;
1201
1202 if ( numChars == 2 )
1203 inBuff++;
1204
1205 if ( ++outLen > dstLen )
1206 return wxCONV_FAILED;
1207
1208 *dst++ = ch;
1209 }
1210
1211
1212 return outLen;
1213 }
1214
1215 size_t
1216 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1217 const wchar_t *src, size_t srcLen) const
1218 {
1219 if ( srcLen == wxNO_LEN )
1220 srcLen = wxWcslen(src) + 1;
1221
1222 size_t outLen = 0;
1223 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1224 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1225 {
1226 wxUint16 cc[2];
1227 const size_t numChars = encode_utf16(*src, cc);
1228 if ( numChars == wxCONV_FAILED )
1229 return wxCONV_FAILED;
1230
1231 outLen += numChars * BYTES_PER_CHAR;
1232 if ( outBuff )
1233 {
1234 if ( outLen > dstLen )
1235 return wxCONV_FAILED;
1236
1237 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1238 if ( numChars == 2 )
1239 {
1240 // second character of a surrogate
1241 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1242 }
1243 }
1244 }
1245
1246 return outLen;
1247 }
1248
1249 #endif // WC_UTF16/!WC_UTF16
1250
1251
1252 // ============================================================================
1253 // UTF-32
1254 // ============================================================================
1255
1256 #ifdef WORDS_BIGENDIAN
1257 #define wxMBConvUTF32straight wxMBConvUTF32BE
1258 #define wxMBConvUTF32swap wxMBConvUTF32LE
1259 #else
1260 #define wxMBConvUTF32swap wxMBConvUTF32BE
1261 #define wxMBConvUTF32straight wxMBConvUTF32LE
1262 #endif
1263
1264
// definitions of the global UTF-32 converter objects, one for each byte order
1265 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1266 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1267
1268 /* static */
1269 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1270 {
1271 if ( srcLen == wxNO_LEN )
1272 {
1273 // count the number of bytes in input, including the trailing NULs
1274 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1275 for ( srcLen = 1; *inBuff++; srcLen++ )
1276 ;
1277
1278 srcLen *= BYTES_PER_CHAR;
1279 }
1280 else // we already have the length
1281 {
1282 // we can only convert an entire number of UTF-32 characters
1283 if ( srcLen % BYTES_PER_CHAR )
1284 return wxCONV_FAILED;
1285 }
1286
1287 return srcLen;
1288 }
1289
1290 // case when in-memory representation is UTF-16
1291 #ifdef WC_UTF16
1292
1293 // ----------------------------------------------------------------------------
1294 // conversions without endianness change
1295 // ----------------------------------------------------------------------------
1296
1297 size_t
1298 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1299 const char *src, size_t srcLen) const
1300 {
1301 srcLen = GetLength(src, srcLen);
1302 if ( srcLen == wxNO_LEN )
1303 return wxCONV_FAILED;
1304
1305 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1306 const size_t inLen = srcLen / BYTES_PER_CHAR;
1307 size_t outLen = 0;
1308 for ( size_t n = 0; n < inLen; n++ )
1309 {
1310 wxUint16 cc[2];
1311 const size_t numChars = encode_utf16(*inBuff++, cc);
1312 if ( numChars == wxCONV_FAILED )
1313 return wxCONV_FAILED;
1314
1315 outLen += numChars;
1316 if ( dst )
1317 {
1318 if ( outLen > dstLen )
1319 return wxCONV_FAILED;
1320
1321 *dst++ = cc[0];
1322 if ( numChars == 2 )
1323 {
1324 // second character of a surrogate
1325 *dst++ = cc[1];
1326 }
1327 }
1328 }
1329
1330 return outLen;
1331 }
1332
1333 size_t
1334 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1335 const wchar_t *src, size_t srcLen) const
1336 {
1337 if ( srcLen == wxNO_LEN )
1338 srcLen = wxWcslen(src) + 1;
1339
1340 if ( !dst )
1341 {
1342 // optimization: return maximal space which could be needed for this
1343 // string instead of the exact amount which could be less if there are
1344 // any surrogates in the input
1345 //
1346 // we consider that surrogates are rare enough to make it worthwhile to
1347 // avoid running the loop below at the cost of slightly extra memory
1348 // consumption
1349 return srcLen * BYTES_PER_CHAR;
1350 }
1351
1352 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1353 size_t outLen = 0;
1354 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1355 {
1356 const wxUint32 ch = wxDecodeSurrogate(&src);
1357 if ( !src )
1358 return wxCONV_FAILED;
1359
1360 outLen += BYTES_PER_CHAR;
1361
1362 if ( outLen > dstLen )
1363 return wxCONV_FAILED;
1364
1365 *outBuff++ = ch;
1366 }
1367
1368 return outLen;
1369 }
1370
1371 // ----------------------------------------------------------------------------
1372 // endian-reversing conversions
1373 // ----------------------------------------------------------------------------
1374
1375 size_t
1376 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1377 const char *src, size_t srcLen) const
1378 {
1379 srcLen = GetLength(src, srcLen);
1380 if ( srcLen == wxNO_LEN )
1381 return wxCONV_FAILED;
1382
1383 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1384 const size_t inLen = srcLen / BYTES_PER_CHAR;
1385 size_t outLen = 0;
1386 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1387 {
1388 wxUint16 cc[2];
1389 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1390 if ( numChars == wxCONV_FAILED )
1391 return wxCONV_FAILED;
1392
1393 outLen += numChars;
1394 if ( dst )
1395 {
1396 if ( outLen > dstLen )
1397 return wxCONV_FAILED;
1398
1399 *dst++ = cc[0];
1400 if ( numChars == 2 )
1401 {
1402 // second character of a surrogate
1403 *dst++ = cc[1];
1404 }
1405 }
1406 }
1407
1408 return outLen;
1409 }
1410
1411 size_t
1412 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1413 const wchar_t *src, size_t srcLen) const
1414 {
1415 if ( srcLen == wxNO_LEN )
1416 srcLen = wxWcslen(src) + 1;
1417
1418 if ( !dst )
1419 {
1420 // optimization: return maximal space which could be needed for this
1421 // string instead of the exact amount which could be less if there are
1422 // any surrogates in the input
1423 //
1424 // we consider that surrogates are rare enough to make it worthwhile to
1425 // avoid running the loop below at the cost of slightly extra memory
1426 // consumption
1427 return srcLen*BYTES_PER_CHAR;
1428 }
1429
1430 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1431 size_t outLen = 0;
1432 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1433 {
1434 const wxUint32 ch = wxDecodeSurrogate(&src);
1435 if ( !src )
1436 return wxCONV_FAILED;
1437
1438 outLen += BYTES_PER_CHAR;
1439
1440 if ( outLen > dstLen )
1441 return wxCONV_FAILED;
1442
1443 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1444 }
1445
1446 return outLen;
1447 }
1448
1449 #else // !WC_UTF16: wchar_t is UTF-32
1450
1451 // ----------------------------------------------------------------------------
1452 // conversions without endianness change
1453 // ----------------------------------------------------------------------------
1454
1455 size_t
1456 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1457 const char *src, size_t srcLen) const
1458 {
1459 // use memcpy() as it should be much faster than hand-written loop
1460 srcLen = GetLength(src, srcLen);
1461 if ( srcLen == wxNO_LEN )
1462 return wxCONV_FAILED;
1463
1464 const size_t inLen = srcLen/BYTES_PER_CHAR;
1465 if ( dst )
1466 {
1467 if ( dstLen < inLen )
1468 return wxCONV_FAILED;
1469
1470 memcpy(dst, src, srcLen);
1471 }
1472
1473 return inLen;
1474 }
1475
1476 size_t
1477 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1478 const wchar_t *src, size_t srcLen) const
1479 {
1480 if ( srcLen == wxNO_LEN )
1481 srcLen = wxWcslen(src) + 1;
1482
1483 srcLen *= BYTES_PER_CHAR;
1484
1485 if ( dst )
1486 {
1487 if ( dstLen < srcLen )
1488 return wxCONV_FAILED;
1489
1490 memcpy(dst, src, srcLen);
1491 }
1492
1493 return srcLen;
1494 }
1495
1496 // ----------------------------------------------------------------------------
1497 // endian-reversing conversions
1498 // ----------------------------------------------------------------------------
1499
1500 size_t
1501 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1502 const char *src, size_t srcLen) const
1503 {
1504 srcLen = GetLength(src, srcLen);
1505 if ( srcLen == wxNO_LEN )
1506 return wxCONV_FAILED;
1507
1508 srcLen /= BYTES_PER_CHAR;
1509
1510 if ( dst )
1511 {
1512 if ( dstLen < srcLen )
1513 return wxCONV_FAILED;
1514
1515 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1516 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1517 {
1518 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1519 }
1520 }
1521
1522 return srcLen;
1523 }
1524
1525 size_t
1526 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1527 const wchar_t *src, size_t srcLen) const
1528 {
1529 if ( srcLen == wxNO_LEN )
1530 srcLen = wxWcslen(src) + 1;
1531
1532 srcLen *= BYTES_PER_CHAR;
1533
1534 if ( dst )
1535 {
1536 if ( dstLen < srcLen )
1537 return wxCONV_FAILED;
1538
1539 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1540 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1541 {
1542 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1543 }
1544 }
1545
1546 return srcLen;
1547 }
1548
1549 #endif // WC_UTF16/!WC_UTF16
1550
1551
1552 // ============================================================================
1553 // The classes doing conversion using the iconv_xxx() functions
1554 // ============================================================================
1555
1556 #ifdef HAVE_ICONV
1557
1558 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1559 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1560 // (unless there's yet another bug in glibc) the only case when iconv()
1561 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1562 // left in the input buffer -- when _real_ error occurs,
1563 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1564 // iconv() failure.
1565 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) tests whether an iconv() call which returned
// cres and left bufLeft bytes of input unconverted must be considered as
// failed.
//
// The macro parameters are parenthesized so that arbitrary expressions (e.g.
// using bitwise or conditional operators) can be safely passed to it.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type of the second iconv()
// parameter (which is, or not, const depending on ICONV_CONST)
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// the value iconv handles are compared with to check if they are valid
#define ICONV_T_INVALID ((iconv_t)-1)
1576
// Select, according to the size of wchar_t, the macro used to swap the bytes
// of a single wchar_t (WC_BSWAP) and the wxFontEncoding corresponding to
// wchar_t (WC_ENC) whose names are tried when looking for the iconv charset to
// use for wide strings.
1577 #if SIZEOF_WCHAR_T == 4
1578 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1579 #define WC_ENC wxFONTENCODING_UTF32
1580 #elif SIZEOF_WCHAR_T == 2
1581 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1582 #define WC_ENC wxFONTENCODING_UTF16
1583 #else // sizeof(wchar_t) != 2 nor 4
1584 // does this ever happen?
1585 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1586 #endif
1587
1588 // ----------------------------------------------------------------------------
1589 // wxMBConv_iconv: encapsulates an iconv character set
1590 // ----------------------------------------------------------------------------
1591
1592 class wxMBConv_iconv : public wxMBConv
1593 {
1594 public:
1595 wxMBConv_iconv(const char *name);
1596 virtual ~wxMBConv_iconv();
1597
1598 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1599 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1600
1601 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1602 virtual size_t GetMBNulLen() const;
1603
1604 #if wxUSE_UNICODE_UTF8
1605 virtual bool IsUTF8() const;
1606 #endif
1607
1608 virtual wxMBConv *Clone() const
1609 {
1610 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1611 p->m_minMBCharWidth = m_minMBCharWidth;
1612 return p;
1613 }
1614
1615 bool IsOk() const
1616 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1617
1618 protected:
1619 // the iconv handlers used to translate from multibyte
1620 // to wide char and in the other direction
1621 iconv_t m2w,
1622 w2m;
1623
1624 #if wxUSE_THREADS
1625 // guards access to m2w and w2m objects
1626 wxMutex m_iconvMutex;
1627 #endif
1628
1629 private:
1630 // the name (for iconv_open()) of a wide char charset -- if none is
1631 // available on this machine, it will remain NULL
1632 static wxString ms_wcCharsetName;
1633
1634 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1635 // different endian-ness than the native one
1636 static bool ms_wcNeedsSwap;
1637
1638
1639 // name of the encoding handled by this conversion
1640 wxString m_name;
1641
1642 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1643 // initially
1644 size_t m_minMBCharWidth;
1645 };
1646
1647 // make the constructor available for unit testing
1648 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1649 {
1650 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1651 if ( !result->IsOk() )
1652 {
1653 delete result;
1654 return 0;
1655 }
1656
1657 return result;
1658 }
1659
// Definitions of wxMBConv_iconv static members: the charset name is initially
// empty and is filled in, together with the swap flag, by the first
// wxMBConv_iconv constructor which finds an iconv charset usable for wchar_t.
1660 wxString wxMBConv_iconv::ms_wcCharsetName;
1661 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1662
1663 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1664 : m_name(name)
1665 {
1666 m_minMBCharWidth = 0;
1667
1668 // check for charset that represents wchar_t:
1669 if ( ms_wcCharsetName.empty() )
1670 {
1671 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1672
1673 #if wxUSE_FONTMAP
1674 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1675 #else // !wxUSE_FONTMAP
1676 static const wxChar *names_static[] =
1677 {
1678 #if SIZEOF_WCHAR_T == 4
1679 _T("UCS-4"),
1680 #elif SIZEOF_WCHAR_T = 2
1681 _T("UCS-2"),
1682 #endif
1683 NULL
1684 };
1685 const wxChar **names = names_static;
1686 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1687
1688 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1689 {
1690 const wxString nameCS(*names);
1691
1692 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1693 wxString nameXE(nameCS);
1694
1695 #ifdef WORDS_BIGENDIAN
1696 nameXE += _T("BE");
1697 #else // little endian
1698 nameXE += _T("LE");
1699 #endif
1700
1701 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1702 nameXE.c_str());
1703
1704 m2w = iconv_open(nameXE.ToAscii(), name);
1705 if ( m2w == ICONV_T_INVALID )
1706 {
1707 // try charset w/o bytesex info (e.g. "UCS4")
1708 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1709 nameCS.c_str());
1710 m2w = iconv_open(nameCS.ToAscii(), name);
1711
1712 // and check for bytesex ourselves:
1713 if ( m2w != ICONV_T_INVALID )
1714 {
1715 char buf[2], *bufPtr;
1716 wchar_t wbuf[2], *wbufPtr;
1717 size_t insz, outsz;
1718 size_t res;
1719
1720 buf[0] = 'A';
1721 buf[1] = 0;
1722 wbuf[0] = 0;
1723 insz = 2;
1724 outsz = SIZEOF_WCHAR_T * 2;
1725 wbufPtr = wbuf;
1726 bufPtr = buf;
1727
1728 res = iconv(
1729 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1730 (char**)&wbufPtr, &outsz);
1731
1732 if (ICONV_FAILED(res, insz))
1733 {
1734 wxLogLastError(wxT("iconv"));
1735 wxLogError(_("Conversion to charset '%s' doesn't work."),
1736 nameCS.c_str());
1737 }
1738 else // ok, can convert to this encoding, remember it
1739 {
1740 ms_wcCharsetName = nameCS;
1741 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1742 }
1743 }
1744 }
1745 else // use charset not requiring byte swapping
1746 {
1747 ms_wcCharsetName = nameXE;
1748 }
1749 }
1750
1751 wxLogTrace(TRACE_STRCONV,
1752 wxT("iconv wchar_t charset is \"%s\"%s"),
1753 ms_wcCharsetName.empty() ? wxString("<none>")
1754 : ms_wcCharsetName,
1755 ms_wcNeedsSwap ? _T(" (needs swap)")
1756 : _T(""));
1757 }
1758 else // we already have ms_wcCharsetName
1759 {
1760 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
1761 }
1762
1763 if ( ms_wcCharsetName.empty() )
1764 {
1765 w2m = ICONV_T_INVALID;
1766 }
1767 else
1768 {
1769 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
1770 if ( w2m == ICONV_T_INVALID )
1771 {
1772 wxLogTrace(TRACE_STRCONV,
1773 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1774 ms_wcCharsetName.c_str(), name);
1775 }
1776 }
1777 }
1778
1779 wxMBConv_iconv::~wxMBConv_iconv()
1780 {
1781 if ( m2w != ICONV_T_INVALID )
1782 iconv_close(m2w);
1783 if ( w2m != ICONV_T_INVALID )
1784 iconv_close(w2m);
1785 }
1786
1787 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1788 {
1789 // find the string length: notice that must be done differently for
1790 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1791 size_t inbuf;
1792 const size_t nulLen = GetMBNulLen();
1793 switch ( nulLen )
1794 {
1795 default:
1796 return wxCONV_FAILED;
1797
1798 case 1:
1799 inbuf = strlen(psz); // arguably more optimized than our version
1800 break;
1801
1802 case 2:
1803 case 4:
1804 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1805 // they also have to start at character boundary and not span two
1806 // adjacent characters
1807 const char *p;
1808 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1809 ;
1810 inbuf = p - psz;
1811 break;
1812 }
1813
1814 #if wxUSE_THREADS
1815 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1816 // Unfortunately there are a couple of global wxCSConv objects such as
1817 // wxConvLocal that are used all over wx code, so we have to make sure
1818 // the handle is used by at most one thread at the time. Otherwise
1819 // only a few wx classes would be safe to use from non-main threads
1820 // as MB<->WC conversion would fail "randomly".
1821 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1822 #endif // wxUSE_THREADS
1823
1824 size_t outbuf = n * SIZEOF_WCHAR_T;
1825 size_t res, cres;
1826 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1827 wchar_t *bufPtr = buf;
1828 const char *pszPtr = psz;
1829
1830 if (buf)
1831 {
1832 // have destination buffer, convert there
1833 cres = iconv(m2w,
1834 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1835 (char**)&bufPtr, &outbuf);
1836 res = n - (outbuf / SIZEOF_WCHAR_T);
1837
1838 if (ms_wcNeedsSwap)
1839 {
1840 // convert to native endianness
1841 for ( unsigned i = 0; i < res; i++ )
1842 buf[n] = WC_BSWAP(buf[i]);
1843 }
1844
1845 // NUL-terminate the string if there is any space left
1846 if (res < n)
1847 buf[res] = 0;
1848 }
1849 else
1850 {
1851 // no destination buffer... convert using temp buffer
1852 // to calculate destination buffer requirement
1853 wchar_t tbuf[8];
1854 res = 0;
1855
1856 do
1857 {
1858 bufPtr = tbuf;
1859 outbuf = 8 * SIZEOF_WCHAR_T;
1860
1861 cres = iconv(m2w,
1862 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1863 (char**)&bufPtr, &outbuf );
1864
1865 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1866 }
1867 while ((cres == (size_t)-1) && (errno == E2BIG));
1868 }
1869
1870 if (ICONV_FAILED(cres, inbuf))
1871 {
1872 //VS: it is ok if iconv fails, hence trace only
1873 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1874 return wxCONV_FAILED;
1875 }
1876
1877 return res;
1878 }
1879
1880 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1881 {
1882 #if wxUSE_THREADS
1883 // NB: explained in MB2WC
1884 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1885 #endif
1886
1887 size_t inlen = wxWcslen(psz);
1888 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1889 size_t outbuf = n;
1890 size_t res, cres;
1891
1892 wchar_t *tmpbuf = 0;
1893
1894 if (ms_wcNeedsSwap)
1895 {
1896 // need to copy to temp buffer to switch endianness
1897 // (doing WC_BSWAP twice on the original buffer won't help, as it
1898 // could be in read-only memory, or be accessed in some other thread)
1899 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1900 for ( size_t i = 0; i < inlen; i++ )
1901 tmpbuf[n] = WC_BSWAP(psz[i]);
1902
1903 tmpbuf[inlen] = L'\0';
1904 psz = tmpbuf;
1905 }
1906
1907 if (buf)
1908 {
1909 // have destination buffer, convert there
1910 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1911
1912 res = n - outbuf;
1913
1914 // NB: iconv was given only wcslen(psz) characters on input, and so
1915 // it couldn't convert the trailing zero. Let's do it ourselves
1916 // if there's some room left for it in the output buffer.
1917 if (res < n)
1918 buf[0] = 0;
1919 }
1920 else
1921 {
1922 // no destination buffer: convert using temp buffer
1923 // to calculate destination buffer requirement
1924 char tbuf[16];
1925 res = 0;
1926 do
1927 {
1928 buf = tbuf;
1929 outbuf = 16;
1930
1931 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1932
1933 res += 16 - outbuf;
1934 }
1935 while ((cres == (size_t)-1) && (errno == E2BIG));
1936 }
1937
1938 if (ms_wcNeedsSwap)
1939 {
1940 free(tmpbuf);
1941 }
1942
1943 if (ICONV_FAILED(cres, inbuf))
1944 {
1945 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1946 return wxCONV_FAILED;
1947 }
1948
1949 return res;
1950 }
1951
1952 size_t wxMBConv_iconv::GetMBNulLen() const
1953 {
1954 if ( m_minMBCharWidth == 0 )
1955 {
1956 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1957
1958 #if wxUSE_THREADS
1959 // NB: explained in MB2WC
1960 wxMutexLocker lock(self->m_iconvMutex);
1961 #endif
1962
1963 const wchar_t *wnul = L"";
1964 char buf[8]; // should be enough for NUL in any encoding
1965 size_t inLen = sizeof(wchar_t),
1966 outLen = WXSIZEOF(buf);
1967 char *inBuff = (char *)wnul;
1968 char *outBuff = buf;
1969 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1970 {
1971 self->m_minMBCharWidth = (size_t)-1;
1972 }
1973 else // ok
1974 {
1975 self->m_minMBCharWidth = outBuff - buf;
1976 }
1977 }
1978
1979 return m_minMBCharWidth;
1980 }
1981
#if wxUSE_UNICODE_UTF8
// Check, ignoring case, whether the charset name is one of the two spellings
// of UTF-8 recognized here.
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
1989
1990 #endif // HAVE_ICONV
1991
1992
1993 // ============================================================================
1994 // Win32 conversion classes
1995 // ============================================================================
1996
1997 #ifdef wxHAVE_WIN32_MB2WC
1998
1999 // from utils.cpp
2000 #if wxUSE_FONTMAP
2001 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2002 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2003 #endif
2004
// wxMBConv implementation converting between wchar_t and the multibyte
// encoding of the Windows code page m_CodePage using the Win32 functions
// MultiByteToWideChar() and WideCharToMultiByte().
//
// As these functions may silently produce approximate results, both
// conversion functions either ask the system to fail for the characters which
// can't be converted exactly or, when this can't be done, convert the result
// back and compare it with the input.
2005 class wxMBConv_win32 : public wxMBConv
2006 {
2007 public:
// default ctor uses the code page CP_ACP
2008 wxMBConv_win32()
2009 {
2010 m_CodePage = CP_ACP;
2011 m_minMBCharWidth = 0;
2012 }
2013
// copy ctor copies the code page and the cached NUL length; notice that the
// base class is default-constructed and not copied
2014 wxMBConv_win32(const wxMBConv_win32& conv)
2015 : wxMBConv()
2016 {
2017 m_CodePage = conv.m_CodePage;
2018 m_minMBCharWidth = conv.m_minMBCharWidth;
2019 }
2020
2021 #if wxUSE_FONTMAP
// ctors from the charset name or font encoding: the code page is looked up
// using the functions declared before this class, use IsOk() to check the
// result
2022 wxMBConv_win32(const char* name)
2023 {
2024 m_CodePage = wxCharsetToCodepage(name);
2025 m_minMBCharWidth = 0;
2026 }
2027
2028 wxMBConv_win32(wxFontEncoding encoding)
2029 {
2030 m_CodePage = wxEncodingToCodepage(encoding);
2031 m_minMBCharWidth = 0;
2032 }
2033 #endif // wxUSE_FONTMAP
2034
// Convert the NUL-terminated multibyte string psz to wide characters.
//
// buf is the output buffer with room for n wide characters or NULL to only
// compute the length. Returns the length of the result, not counting the
// terminating NUL, or wxCONV_FAILED on error.
2035 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2036 {
2037 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2038 // the behaviour is not compatible with the Unix version (using iconv)
2039 // and break the library itself, e.g. wxTextInputStream::NextChar()
2040 // wouldn't work if reading an incomplete MB char didn't result in an
2041 // error
2042 //
2043 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2044 // Win XP or newer and it is not supported for UTF-[78] so we always
2045 // use our own conversions in this case. See
2046 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2047 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2048 if ( m_CodePage == CP_UTF8 )
2049 {
2050 return wxMBConvUTF8().MB2WC(buf, psz, n);
2051 }
2052
2053 if ( m_CodePage == CP_UTF7 )
2054 {
2055 return wxMBConvUTF7().MB2WC(buf, psz, n);
2056 }
2057
// use MB_ERR_INVALID_CHARS only for the code pages below 50000 other than
// CP_SYMBOL and only if the system is recent enough
2058 int flags = 0;
2059 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2060 IsAtLeastWin2kSP4() )
2061 {
2062 flags = MB_ERR_INVALID_CHARS;
2063 }
2064
2065 const size_t len = ::MultiByteToWideChar
2066 (
2067 m_CodePage, // code page
2068 flags, // flags: fail on error
2069 psz, // input string
2070 -1, // its length (NUL-terminated)
2071 buf, // output string
2072 buf ? n : 0 // size of output buffer
2073 );
2074 if ( !len )
2075 {
2076 // function totally failed
2077 return wxCONV_FAILED;
2078 }
2079
2080 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2081 // check if we succeeded, by doing a double trip:
2082 if ( !flags && buf )
2083 {
2084 const size_t mbLen = strlen(psz);
2085 wxCharBuffer mbBuf(mbLen);
2086 if ( ::WideCharToMultiByte
2087 (
2088 m_CodePage,
2089 0,
2090 buf,
2091 -1,
2092 mbBuf.data(),
2093 mbLen + 1, // size in bytes, not length
2094 NULL,
2095 NULL
2096 ) == 0 ||
2097 strcmp(mbBuf, psz) != 0 )
2098 {
2099 // we didn't obtain the same thing we started from, hence
2100 // the conversion was lossy and we consider that it failed
2101 return wxCONV_FAILED;
2102 }
2103 }
2104
2105 // note that it returns count of written chars for buf != NULL and size
2106 // of the needed buffer for buf == NULL so in either case the length of
2107 // the string (which never includes the terminating NUL) is one less
2108 return len - 1;
2109 }
2110
// Convert the NUL-terminated wide string pwz to multibyte one.
//
// buf is the output buffer of n bytes or NULL to only compute the length.
// Returns the length of the result, not counting the terminating NUL, or
// wxCONV_FAILED on error, including when the conversion was not exact.
2111 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2112 {
2113 /*
2114 we have a problem here: by default, WideCharToMultiByte() may
2115 replace characters unrepresentable in the target code page with bad
2116 quality approximations such as turning "1/2" symbol (U+00BD) into
2117 "1" for the code pages which don't have it and we, obviously, want
2118 to avoid this at any price
2119
2120 the trouble is that this function does it _silently_, i.e. it won't
2121 even tell us whether it did or not... Win98/2000 and higher provide
2122 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2123 we have to resort to a round trip, i.e. check that converting back
2124 results in the same string -- this is, of course, expensive but
2125 otherwise we simply can't be sure to not garble the data.
2126 */
2127
2128 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2129 // it doesn't work with CJK encodings (which we test for rather roughly
2130 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2131 // supporting it
2132 BOOL usedDef wxDUMMY_INITIALIZE(false);
2133 BOOL *pUsedDef;
2134 int flags;
2135 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2136 {
2137 // it's our lucky day
2138 flags = WC_NO_BEST_FIT_CHARS;
2139 pUsedDef = &usedDef;
2140 }
2141 else // old system or unsupported encoding
2142 {
2143 flags = 0;
2144 pUsedDef = NULL;
2145 }
2146
2147 const size_t len = ::WideCharToMultiByte
2148 (
2149 m_CodePage, // code page
2150 flags, // either none or no best fit
2151 pwz, // input string
2152 -1, // it is (wide) NUL-terminated
2153 buf, // output buffer
2154 buf ? n : 0, // and its size
2155 NULL, // default "replacement" char
2156 pUsedDef // [out] was it used?
2157 );
2158
2159 if ( !len )
2160 {
2161 // function totally failed
2162 return wxCONV_FAILED;
2163 }
2164
2165 // if we were really converting, check if we succeeded
2166 if ( buf )
2167 {
2168 if ( flags )
2169 {
2170 // check if the conversion failed, i.e. if any replacements
2171 // were done
2172 if ( usedDef )
2173 return wxCONV_FAILED;
2174 }
2175 else // we must resort to double tripping...
2176 {
2177 wxWCharBuffer wcBuf(n);
2178 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2179 wcscmp(wcBuf, pwz) != 0 )
2180 {
2181 // we didn't obtain the same thing we started from, hence
2182 // the conversion was lossy and we consider that it failed
2183 return wxCONV_FAILED;
2184 }
2185 }
2186 }
2187
2188 // see the comment above for the reason of "len - 1"
2189 return len - 1;
2190 }
2191
// Return the number of bytes (1, 2 or 4) used for NUL in this code page or
// (size_t)-1 if it couldn't be determined or has any other value.
//
// The value is computed by converting a wide NUL when the function is called
// for the first time and is cached in m_minMBCharWidth.
2192 virtual size_t GetMBNulLen() const
2193 {
2194 if ( m_minMBCharWidth == 0 )
2195 {
2196 int len = ::WideCharToMultiByte
2197 (
2198 m_CodePage, // code page
2199 0, // no flags
2200 L"", // input string
2201 1, // translate just the NUL
2202 NULL, // output buffer
2203 0, // and its size
2204 NULL, // no replacement char
2205 NULL // [out] don't care if it was used
2206 );
2207
// the cached value is modified from a const method
2208 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2209 switch ( len )
2210 {
2211 default:
2212 wxLogDebug(_T("Unexpected NUL length %d"), len);
2213 self->m_minMBCharWidth = (size_t)-1;
2214 break;
2215
// the conversion of NUL itself failed
2216 case 0:
2217 self->m_minMBCharWidth = (size_t)-1;
2218 break;
2219
2220 case 1:
2221 case 2:
2222 case 4:
2223 self->m_minMBCharWidth = len;
2224 break;
2225 }
2226 }
2227
2228 return m_minMBCharWidth;
2229 }
2230
2231 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2232
// the object can be used unless its code page is -1
// NOTE(review): presumably -1 is what the code page lookup functions used by
// the ctors return for unknown charsets -- confirm in utils.cpp
2233 bool IsOk() const { return m_CodePage != -1; }
2234
2235 private:
// Return true if WC_NO_BEST_FIT_CHARS flag can be used: this is the case for
// Windows 9x version 4.10 or later and Windows NT version 5 or later. The
// result of the check is cached in a static variable.
2236 static bool CanUseNoBestFit()
2237 {
2238 static int s_isWin98Or2k = -1;
2239
2240 if ( s_isWin98Or2k == -1 )
2241 {
2242 int verMaj, verMin;
2243 switch ( wxGetOsVersion(&verMaj, &verMin) )
2244 {
2245 case wxOS_WINDOWS_9X:
2246 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2247 break;
2248
2249 case wxOS_WINDOWS_NT:
2250 s_isWin98Or2k = verMaj >= 5;
2251 break;
2252
2253 default:
2254 // unknown: be conservative by default
2255 s_isWin98Or2k = 0;
2256 break;
2257 }
2258
2259 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2260 }
2261
2262 return s_isWin98Or2k == 1;
2263 }
2264
// Return true if the system version is at least Windows 2000 SP4, i.e. if
// MB_ERR_INVALID_CHARS is used by MB2WC(). Always false under Windows CE,
// otherwise the result of the check is cached in a static variable.
2265 static bool IsAtLeastWin2kSP4()
2266 {
2267 #ifdef __WXWINCE__
2268 return false;
2269 #else
2270 static int s_isAtLeastWin2kSP4 = -1;
2271
2272 if ( s_isAtLeastWin2kSP4 == -1 )
2273 {
2274 OSVERSIONINFOEX ver;
2275
// the structure is zeroed first so that all the version fields tested below
// are 0 if GetVersionEx(), whose return value is not checked, doesn't fill them
2276 memset(&ver, 0, sizeof(ver));
2277 ver.dwOSVersionInfoSize = sizeof(ver);
2278 GetVersionEx((OSVERSIONINFO*)&ver);
2279
2280 s_isAtLeastWin2kSP4 =
2281 ((ver.dwMajorVersion > 5) || // Vista+
2282 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2283 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2284 ver.wServicePackMajor >= 4)) // 2000 SP4+
2285 ? 1 : 0;
2286 }
2287
2288 return s_isAtLeastWin2kSP4 == 1;
2289 #endif
2290 }
2291
2292
2293 // the code page we're working with
2294 long m_CodePage;
2295
2296 // cached result of GetMBNulLen(), set to 0 initially meaning
2297 // "unknown"
2298 size_t m_minMBCharWidth;
2299 };
2300
2301 #endif // wxHAVE_WIN32_MB2WC
2302
2303
2304 // ============================================================================
2305 // Mac conversion classes
2306 // ============================================================================
2307
2308 /* Although we are in the base library we currently have this wxMac
2309 * conditional. This is not generally good but fortunately does not affect
2310 * the ABI of the base library, only what encodings might work.
2311 * It does mean that a wxBase built as part of wxMac has slightly more support
2312 * than one built for wxCocoa or even wxGtk.
2313 */
2314 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2315
2316 class wxMBConv_mac : public wxMBConv
2317 {
2318 public:
2319 wxMBConv_mac()
2320 {
2321 Init(CFStringGetSystemEncoding()) ;
2322 }
2323
2324 wxMBConv_mac(const wxMBConv_mac& conv)
2325 {
2326 Init(conv.m_char_encoding);
2327 }
2328
2329 #if wxUSE_FONTMAP
2330 wxMBConv_mac(const char* name)
2331 {
2332 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2333 }
2334 #endif
2335
2336 wxMBConv_mac(wxFontEncoding encoding)
2337 {
2338 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2339 }
2340
2341 virtual ~wxMBConv_mac()
2342 {
2343 OSStatus status = noErr ;
2344 if (m_MB2WC_converter)
2345 status = TECDisposeConverter(m_MB2WC_converter);
2346 if (m_WC2MB_converter)
2347 status = TECDisposeConverter(m_WC2MB_converter);
2348 }
2349
2350 void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2351 TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2352 {
2353 m_MB2WC_converter = NULL ;
2354 m_WC2MB_converter = NULL ;
2355 m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
2356 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2357 }
2358
2359 virtual void CreateIfNeeded() const
2360 {
2361 if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
2362 {
2363 OSStatus status = noErr ;
2364 status = TECCreateConverter(&m_MB2WC_converter,
2365 m_char_encoding,
2366 m_unicode_encoding);
2367 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2368 status = TECCreateConverter(&m_WC2MB_converter,
2369 m_unicode_encoding,
2370 m_char_encoding);
2371 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2372 }
2373 }
2374
2375 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2376 {
2377 CreateIfNeeded() ;
2378 OSStatus status = noErr ;
2379 ByteCount byteOutLen ;
2380 ByteCount byteInLen = strlen(psz) + 1;
2381 wchar_t *tbuf = NULL ;
2382 UniChar* ubuf = NULL ;
2383 size_t res = 0 ;
2384
2385 if (buf == NULL)
2386 {
2387 // Apple specs say at least 32
2388 n = wxMax( 32, byteInLen ) ;
2389 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2390 }
2391
2392 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2393
2394 #if SIZEOF_WCHAR_T == 4
2395 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2396 #else
2397 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2398 #endif
2399
2400 status = TECConvertText(
2401 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2402 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2403
2404 #if SIZEOF_WCHAR_T == 4
2405 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2406 // is not properly terminated we get random characters at the end
2407 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2408 wxMBConvUTF16 converter ;
2409 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2410 free( ubuf ) ;
2411 #else
2412 res = byteOutLen / sizeof( UniChar ) ;
2413 #endif
2414
2415 if ( buf == NULL )
2416 free(tbuf) ;
2417
2418 if ( buf && res < n)
2419 buf[res] = 0;
2420
2421 return res ;
2422 }
2423
2424 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2425 {
2426 CreateIfNeeded() ;
2427 OSStatus status = noErr ;
2428 ByteCount byteOutLen ;
2429 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2430
2431 char *tbuf = NULL ;
2432
2433 if (buf == NULL)
2434 {
2435 // Apple specs say at least 32
2436 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2437 tbuf = (char*) malloc( n ) ;
2438 }
2439
2440 ByteCount byteBufferLen = n ;
2441 UniChar* ubuf = NULL ;
2442
2443 #if SIZEOF_WCHAR_T == 4
2444 wxMBConvUTF16 converter ;
2445 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2446 byteInLen = unicharlen ;
2447 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2448 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2449 #else
2450 ubuf = (UniChar*) psz ;
2451 #endif
2452
2453 status = TECConvertText(
2454 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2455 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2456
2457 #if SIZEOF_WCHAR_T == 4
2458 free( ubuf ) ;
2459 #endif
2460
2461 if ( buf == NULL )
2462 free(tbuf) ;
2463
2464 size_t res = byteOutLen ;
2465 if ( buf && res < n)
2466 {
2467 buf[res] = 0;
2468
2469 //we need to double-trip to verify it didn't insert any ? in place
2470 //of bogus characters
2471 wxWCharBuffer wcBuf(n);
2472 size_t pszlen = wxWcslen(psz);
2473 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2474 wxWcslen(wcBuf) != pszlen ||
2475 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2476 {
2477 // we didn't obtain the same thing we started from, hence
2478 // the conversion was lossy and we consider that it failed
2479 return wxCONV_FAILED;
2480 }
2481 }
2482
2483 return res ;
2484 }
2485
2486 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2487
2488 bool IsOk() const
2489 {
2490 CreateIfNeeded() ;
2491 return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
2492 }
2493
2494 protected :
2495 mutable TECObjectRef m_MB2WC_converter;
2496 mutable TECObjectRef m_WC2MB_converter;
2497
2498 TextEncodingBase m_char_encoding;
2499 TextEncodingBase m_unicode_encoding;
2500 };
2501
2502 // MB is decomposed (D) normalized UTF8
2503
2504 class wxMBConv_macUTF8D : public wxMBConv_mac
2505 {
2506 public :
2507 wxMBConv_macUTF8D()
2508 {
2509 Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
2510 m_uni = NULL;
2511 m_uniBack = NULL ;
2512 }
2513
2514 virtual ~wxMBConv_macUTF8D()
2515 {
2516 if (m_uni!=NULL)
2517 DisposeUnicodeToTextInfo(&m_uni);
2518 if (m_uniBack!=NULL)
2519 DisposeUnicodeToTextInfo(&m_uniBack);
2520 }
2521
2522 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2523 {
2524 CreateIfNeeded() ;
2525 OSStatus status = noErr ;
2526 ByteCount byteOutLen ;
2527 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2528
2529 char *tbuf = NULL ;
2530
2531 if (buf == NULL)
2532 {
2533 // Apple specs say at least 32
2534 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2535 tbuf = (char*) malloc( n ) ;
2536 }
2537
2538 ByteCount byteBufferLen = n ;
2539 UniChar* ubuf = NULL ;
2540
2541 #if SIZEOF_WCHAR_T == 4
2542 wxMBConvUTF16 converter ;
2543 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2544 byteInLen = unicharlen ;
2545 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2546 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2547 #else
2548 ubuf = (UniChar*) psz ;
2549 #endif
2550
2551 // ubuf is a non-decomposed UniChar buffer
2552
2553 ByteCount dcubuflen = byteInLen * 2 + 2 ;
2554 ByteCount dcubufread , dcubufwritten ;
2555 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2556
2557 ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
2558 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , dcubuf ) ;
2559
2560 // we now convert that decomposed buffer into UTF8
2561
2562 status = TECConvertText(
2563 m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
2564 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2565
2566 free( dcubuf );
2567
2568 #if SIZEOF_WCHAR_T == 4
2569 free( ubuf ) ;
2570 #endif
2571
2572 if ( buf == NULL )
2573 free(tbuf) ;
2574
2575 size_t res = byteOutLen ;
2576 if ( buf && res < n)
2577 {
2578 buf[res] = 0;
2579 // don't test for round-trip fidelity yet, we cannot guarantee it yet
2580 }
2581
2582 return res ;
2583 }
2584
2585 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2586 {
2587 CreateIfNeeded() ;
2588 OSStatus status = noErr ;
2589 ByteCount byteOutLen ;
2590 ByteCount byteInLen = strlen(psz) + 1;
2591 wchar_t *tbuf = NULL ;
2592 UniChar* ubuf = NULL ;
2593 size_t res = 0 ;
2594
2595 if (buf == NULL)
2596 {
2597 // Apple specs say at least 32
2598 n = wxMax( 32, byteInLen ) ;
2599 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2600 }
2601
2602 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2603
2604 #if SIZEOF_WCHAR_T == 4
2605 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2606 #else
2607 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2608 #endif
2609
2610 ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
2611 ByteCount dcubufread , dcubufwritten ;
2612 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2613
2614 status = TECConvertText(
2615 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2616 (TextPtr) dcubuf, dcubuflen, &byteOutLen);
2617 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2618 // is not properly terminated we get random characters at the end
2619 dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2620
2621 // now from the decomposed UniChar to properly composed uniChar
2622 ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
2623 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , ubuf ) ;
2624
2625 free( dcubuf );
2626 byteOutLen = dcubufwritten ;
2627 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2628
2629
2630 #if SIZEOF_WCHAR_T == 4
2631 wxMBConvUTF16 converter ;
2632 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2633 free( ubuf ) ;
2634 #else
2635 res = byteOutLen / sizeof( UniChar ) ;
2636 #endif
2637
2638 if ( buf == NULL )
2639 free(tbuf) ;
2640
2641 if ( buf && res < n)
2642 buf[res] = 0;
2643
2644 return res ;
2645 }
2646
2647 virtual void CreateIfNeeded() const
2648 {
2649 wxMBConv_mac::CreateIfNeeded() ;
2650 if ( m_uni == NULL )
2651 {
2652 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
2653 kUnicodeNoSubset, kTextEncodingDefaultFormat);
2654 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
2655 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
2656 m_map.mappingVersion = kUnicodeUseLatestMapping;
2657
2658 OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
2659 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
2660
2661 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
2662 kUnicodeNoSubset, kTextEncodingDefaultFormat);
2663 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
2664 kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
2665 m_map.mappingVersion = kUnicodeUseLatestMapping;
2666 err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
2667 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
2668 }
2669 }
2670 protected :
2671 mutable UnicodeToTextInfo m_uni;
2672 mutable UnicodeToTextInfo m_uniBack;
2673 mutable UnicodeMapping m_map;
2674 };
2675 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2676
2677 // ============================================================================
2678 // wxEncodingConverter based conversion classes
2679 // ============================================================================
2680
2681 #if wxUSE_FONTMAP
2682
2683 class wxMBConv_wxwin : public wxMBConv
2684 {
2685 private:
2686 void Init()
2687 {
2688 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2689 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2690 }
2691
2692 public:
2693 // temporarily just use wxEncodingConverter stuff,
2694 // so that it works while a better implementation is built
2695 wxMBConv_wxwin(const char* name)
2696 {
2697 if (name)
2698 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2699 else
2700 m_enc = wxFONTENCODING_SYSTEM;
2701
2702 Init();
2703 }
2704
2705 wxMBConv_wxwin(wxFontEncoding enc)
2706 {
2707 m_enc = enc;
2708
2709 Init();
2710 }
2711
2712 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2713 {
2714 size_t inbuf = strlen(psz);
2715 if (buf)
2716 {
2717 if (!m2w.Convert(psz, buf))
2718 return wxCONV_FAILED;
2719 }
2720 return inbuf;
2721 }
2722
2723 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2724 {
2725 const size_t inbuf = wxWcslen(psz);
2726 if (buf)
2727 {
2728 if (!w2m.Convert(psz, buf))
2729 return wxCONV_FAILED;
2730 }
2731
2732 return inbuf;
2733 }
2734
2735 virtual size_t GetMBNulLen() const
2736 {
2737 switch ( m_enc )
2738 {
2739 case wxFONTENCODING_UTF16BE:
2740 case wxFONTENCODING_UTF16LE:
2741 return 2;
2742
2743 case wxFONTENCODING_UTF32BE:
2744 case wxFONTENCODING_UTF32LE:
2745 return 4;
2746
2747 default:
2748 return 1;
2749 }
2750 }
2751
2752 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2753
2754 bool IsOk() const { return m_ok; }
2755
2756 public:
2757 wxFontEncoding m_enc;
2758 wxEncodingConverter m2w, w2m;
2759
2760 private:
2761 // were we initialized successfully?
2762 bool m_ok;
2763
2764 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2765 };
2766
2767 // make the constructors available for unit testing
2768 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2769 {
2770 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2771 if ( !result->IsOk() )
2772 {
2773 delete result;
2774 return 0;
2775 }
2776
2777 return result;
2778 }
2779
2780 #endif // wxUSE_FONTMAP
2781
2782 // ============================================================================
2783 // wxCSConv implementation
2784 // ============================================================================
2785
2786 void wxCSConv::Init()
2787 {
2788 m_name = NULL;
2789 m_convReal = NULL;
2790 m_deferred = true;
2791 }
2792
2793 wxCSConv::wxCSConv(const wxString& charset)
2794 {
2795 Init();
2796
2797 if ( !charset.empty() )
2798 {
2799 SetName(charset.ToAscii());
2800 }
2801
2802 #if wxUSE_FONTMAP
2803 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2804 #else
2805 m_encoding = wxFONTENCODING_SYSTEM;
2806 #endif
2807 }
2808
2809 wxCSConv::wxCSConv(wxFontEncoding encoding)
2810 {
2811 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2812 {
2813 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2814
2815 encoding = wxFONTENCODING_SYSTEM;
2816 }
2817
2818 Init();
2819
2820 m_encoding = encoding;
2821 }
2822
2823 wxCSConv::~wxCSConv()
2824 {
2825 Clear();
2826 }
2827
2828 wxCSConv::wxCSConv(const wxCSConv& conv)
2829 : wxMBConv()
2830 {
2831 Init();
2832
2833 SetName(conv.m_name);
2834 m_encoding = conv.m_encoding;
2835 }
2836
2837 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2838 {
2839 Clear();
2840
2841 SetName(conv.m_name);
2842 m_encoding = conv.m_encoding;
2843
2844 return *this;
2845 }
2846
2847 void wxCSConv::Clear()
2848 {
2849 free(m_name);
2850 delete m_convReal;
2851
2852 m_name = NULL;
2853 m_convReal = NULL;
2854 }
2855
2856 void wxCSConv::SetName(const char *charset)
2857 {
2858 if (charset)
2859 {
2860 m_name = strdup(charset);
2861 m_deferred = true;
2862 }
2863 }
2864
2865 #if wxUSE_FONTMAP
2866
2867 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2868 wxEncodingNameCache );
2869
2870 static wxEncodingNameCache gs_nameCache;
2871 #endif
2872
2873 wxMBConv *wxCSConv::DoCreate() const
2874 {
2875 #if wxUSE_FONTMAP
2876 wxLogTrace(TRACE_STRCONV,
2877 wxT("creating conversion for %s"),
2878 (m_name ? m_name
2879 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2880 #endif // wxUSE_FONTMAP
2881
2882 // check for the special case of ASCII or ISO8859-1 charset: as we have
2883 // special knowledge of it anyhow, we don't need to create a special
2884 // conversion object
2885 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2886 m_encoding == wxFONTENCODING_DEFAULT )
2887 {
2888 // don't convert at all
2889 return NULL;
2890 }
2891
2892 // we trust OS to do conversion better than we can so try external
2893 // conversion methods first
2894 //
2895 // the full order is:
2896 // 1. OS conversion (iconv() under Unix or Win32 API)
2897 // 2. hard coded conversions for UTF
2898 // 3. wxEncodingConverter as fall back
2899
2900 // step (1)
2901 #ifdef HAVE_ICONV
2902 #if !wxUSE_FONTMAP
2903 if ( m_name )
2904 #endif // !wxUSE_FONTMAP
2905 {
2906 #if wxUSE_FONTMAP
2907 wxFontEncoding encoding(m_encoding);
2908 #endif
2909
2910 if ( m_name )
2911 {
2912 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2913 if ( conv->IsOk() )
2914 return conv;
2915
2916 delete conv;
2917
2918 #if wxUSE_FONTMAP
2919 encoding =
2920 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2921 #endif // wxUSE_FONTMAP
2922 }
2923 #if wxUSE_FONTMAP
2924 {
2925 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2926 if ( it != gs_nameCache.end() )
2927 {
2928 if ( it->second.empty() )
2929 return NULL;
2930
2931 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2932 if ( conv->IsOk() )
2933 return conv;
2934
2935 delete conv;
2936 }
2937
2938 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2939 // CS : in case this does not return valid names (eg for MacRoman)
2940 // encoding got a 'failure' entry in the cache all the same,
2941 // although it just has to be created using a different method, so
2942 // only store failed iconv creation attempts (or perhaps we
2943 // shoulnd't do this at all ?)
2944 if ( names[0] != NULL )
2945 {
2946 for ( ; *names; ++names )
2947 {
2948 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2949 // will need changes that will obsolete this
2950 wxString name(*names);
2951 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2952 if ( conv->IsOk() )
2953 {
2954 gs_nameCache[encoding] = *names;
2955 return conv;
2956 }
2957
2958 delete conv;
2959 }
2960
2961 gs_nameCache[encoding] = _T(""); // cache the failure
2962 }
2963 }
2964 #endif // wxUSE_FONTMAP
2965 }
2966 #endif // HAVE_ICONV
2967
2968 #ifdef wxHAVE_WIN32_MB2WC
2969 {
2970 #if wxUSE_FONTMAP
2971 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2972 : new wxMBConv_win32(m_encoding);
2973 if ( conv->IsOk() )
2974 return conv;
2975
2976 delete conv;
2977 #else
2978 return NULL;
2979 #endif
2980 }
2981 #endif // wxHAVE_WIN32_MB2WC
2982
2983 #if defined(__WXMAC__)
2984 {
2985 // leave UTF16 and UTF32 to the built-ins of wx
2986 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2987 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2988 {
2989 #if wxUSE_FONTMAP
2990 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2991 : new wxMBConv_mac(m_encoding);
2992 #else
2993 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2994 #endif
2995 if ( conv->IsOk() )
2996 return conv;
2997
2998 delete conv;
2999 }
3000 }
3001 #endif
3002
3003 #ifdef __DARWIN__
3004 {
3005 // leave UTF16 and UTF32 to the built-ins of wx
3006 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3007 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3008 {
3009 #if wxUSE_FONTMAP
3010 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3011 : new wxMBConv_cf(m_encoding);
3012 #else
3013 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3014 #endif
3015
3016 if ( conv->IsOk() )
3017 return conv;
3018
3019 delete conv;
3020 }
3021 }
3022 #endif // __DARWIN__
3023
3024 // step (2)
3025 wxFontEncoding enc = m_encoding;
3026 #if wxUSE_FONTMAP
3027 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3028 {
3029 // use "false" to suppress interactive dialogs -- we can be called from
3030 // anywhere and popping up a dialog from here is the last thing we want to
3031 // do
3032 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3033 }
3034 #endif // wxUSE_FONTMAP
3035
3036 switch ( enc )
3037 {
3038 case wxFONTENCODING_UTF7:
3039 return new wxMBConvUTF7;
3040
3041 case wxFONTENCODING_UTF8:
3042 return new wxMBConvUTF8;
3043
3044 case wxFONTENCODING_UTF16BE:
3045 return new wxMBConvUTF16BE;
3046
3047 case wxFONTENCODING_UTF16LE:
3048 return new wxMBConvUTF16LE;
3049
3050 case wxFONTENCODING_UTF32BE:
3051 return new wxMBConvUTF32BE;
3052
3053 case wxFONTENCODING_UTF32LE:
3054 return new wxMBConvUTF32LE;
3055
3056 default:
3057 // nothing to do but put here to suppress gcc warnings
3058 break;
3059 }
3060
3061 // step (3)
3062 #if wxUSE_FONTMAP
3063 {
3064 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3065 : new wxMBConv_wxwin(m_encoding);
3066 if ( conv->IsOk() )
3067 return conv;
3068
3069 delete conv;
3070 }
3071 #endif // wxUSE_FONTMAP
3072
3073 // NB: This is a hack to prevent deadlock. What could otherwise happen
3074 // in Unicode build: wxConvLocal creation ends up being here
3075 // because of some failure and logs the error. But wxLog will try to
3076 // attach a timestamp, for which it will need wxConvLocal (to convert
3077 // time to char* and then wchar_t*), but that fails, tries to log the
3078 // error, but wxLog has an (already locked) critical section that
3079 // guards the static buffer.
3080 static bool alreadyLoggingError = false;
3081 if (!alreadyLoggingError)
3082 {
3083 alreadyLoggingError = true;
3084 wxLogError(_("Cannot convert from the charset '%s'!"),
3085 m_name ? m_name
3086 :
3087 #if wxUSE_FONTMAP
3088 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3089 #else // !wxUSE_FONTMAP
3090 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3091 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3092 );
3093
3094 alreadyLoggingError = false;
3095 }
3096
3097 return NULL;
3098 }
3099
3100 void wxCSConv::CreateConvIfNeeded() const
3101 {
3102 if ( m_deferred )
3103 {
3104 wxCSConv *self = (wxCSConv *)this; // const_cast
3105
3106 // if we don't have neither the name nor the encoding, use the default
3107 // encoding for this system
3108 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3109 {
3110 #if wxUSE_INTL
3111 self->m_encoding = wxLocale::GetSystemEncoding();
3112 #else
3113 // fallback to some reasonable default:
3114 self->m_encoding = wxFONTENCODING_ISO8859_1;
3115 #endif // wxUSE_INTL
3116 }
3117
3118 self->m_convReal = DoCreate();
3119 self->m_deferred = false;
3120 }
3121 }
3122
3123 bool wxCSConv::IsOk() const
3124 {
3125 CreateConvIfNeeded();
3126
3127 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3128 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3129 return true; // always ok as we do it ourselves
3130
3131 // m_convReal->IsOk() is called at its own creation, so we know it must
3132 // be ok if m_convReal is non-NULL
3133 return m_convReal != NULL;
3134 }
3135
3136 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3137 const char *src, size_t srcLen) const
3138 {
3139 CreateConvIfNeeded();
3140
3141 if (m_convReal)
3142 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3143
3144 // latin-1 (direct)
3145 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3146 }
3147
3148 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3149 const wchar_t *src, size_t srcLen) const
3150 {
3151 CreateConvIfNeeded();
3152
3153 if (m_convReal)
3154 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3155
3156 // latin-1 (direct)
3157 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3158 }
3159
3160 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3161 {
3162 CreateConvIfNeeded();
3163
3164 if (m_convReal)
3165 return m_convReal->MB2WC(buf, psz, n);
3166
3167 // latin-1 (direct)
3168 size_t len = strlen(psz);
3169
3170 if (buf)
3171 {
3172 for (size_t c = 0; c <= len; c++)
3173 buf[c] = (unsigned char)(psz[c]);
3174 }
3175
3176 return len;
3177 }
3178
3179 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3180 {
3181 CreateConvIfNeeded();
3182
3183 if (m_convReal)
3184 return m_convReal->WC2MB(buf, psz, n);
3185
3186 // latin-1 (direct)
3187 const size_t len = wxWcslen(psz);
3188 if (buf)
3189 {
3190 for (size_t c = 0; c <= len; c++)
3191 {
3192 if (psz[c] > 0xFF)
3193 return wxCONV_FAILED;
3194
3195 buf[c] = (char)psz[c];
3196 }
3197 }
3198 else
3199 {
3200 for (size_t c = 0; c <= len; c++)
3201 {
3202 if (psz[c] > 0xFF)
3203 return wxCONV_FAILED;
3204 }
3205 }
3206
3207 return len;
3208 }
3209
3210 size_t wxCSConv::GetMBNulLen() const
3211 {
3212 CreateConvIfNeeded();
3213
3214 if ( m_convReal )
3215 {
3216 return m_convReal->GetMBNulLen();
3217 }
3218
3219 // otherwise, we are ISO-8859-1
3220 return 1;
3221 }
3222
3223 #if wxUSE_UNICODE_UTF8
3224 bool wxCSConv::IsUTF8() const
3225 {
3226 CreateConvIfNeeded();
3227
3228 if ( m_convReal )
3229 {
3230 return m_convReal->IsUTF8();
3231 }
3232
3233 // otherwise, we are ISO-8859-1
3234 return false;
3235 }
3236 #endif
3237
3238
3239 #if wxUSE_UNICODE
3240
3241 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3242 {
3243 if ( !s )
3244 return wxWCharBuffer();
3245
3246 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3247 if ( !wbuf )
3248 wbuf = wxMBConvUTF8().cMB2WX(s);
3249 if ( !wbuf )
3250 wbuf = wxConvISO8859_1.cMB2WX(s);
3251
3252 return wbuf;
3253 }
3254
3255 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3256 {
3257 if ( !ws )
3258 return wxCharBuffer();
3259
3260 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3261 if ( !buf )
3262 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3263
3264 return buf;
3265 }
3266
3267 #endif // wxUSE_UNICODE
3268
3269 // ----------------------------------------------------------------------------
3270 // globals
3271 // ----------------------------------------------------------------------------
3272
3273 // NB: The reason why we create converter objects in this convoluted way,
3274 // using a factory function instead of global variable, is that they
3275 // may be used at static initialization time (some of them are used by
3276 // wxString ctors and there may be a global wxString object). In other
3277 // words, possibly _before_ the converter global object would be
3278 // initialized.
3279
// the names below are used as plain identifiers in the rest of this file
// (presumably they are macros in the public headers -- TODO confirm)
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Define, for the converter "name":
//  - the pointer variable namePtr, initially NULL
//  - the accessor wxGet_namePtr() returning the address of a function-local
//    static object of type impl_klass constructed with ctor_args
//  - a file-static variable initialized by calling that accessor
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* calling the accessor during static initialization guarantees */  \
    /* that every global converter exists once it is over, i.e.     */  \
    /* before any thread is launched:                               */  \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// same as above when the object has the same type as the pointer
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3300
3301 #ifdef __WINDOWS__
3302 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3303 #elif defined(__WXMAC__) && !defined(__MACH__)
3304 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_mac, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3305 #else
3306 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3307 #endif
3308
3309 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3310 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3311
3312 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3313 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3314
3315 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3316 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3317
3318 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3319 static wxMBConv_macUTF8D wxConvMacUTF8DObj;
3320 #endif
3321 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3322 #ifdef __WXOSX__
3323 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3324 &wxConvMacUTF8DObj;
3325 #else
3326 wxGet_wxConvUTF8Ptr();
3327 #endif
3328 #else // !__WXOSX__
3329 wxGet_wxConvLibcPtr();
3330 #endif // __WXOSX__/!__WXOSX__
3331
3332 #else // !wxUSE_WCHAR_T
3333
3334 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3335 // stand-ins in absence of wchar_t
3336 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3337 wxConvISO8859_1,
3338 wxConvLocal,
3339 wxConvUTF8;
3340
3341 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T