Fix wxMBConv_cf to implement FromWChar/ToWChar in lieu of now deprecated WC2MB/MB2WC.
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef __SALFORDC__
48 #include <clib.h>
49 #endif
50
51 #ifdef HAVE_ICONV
52 #include <iconv.h>
53 #include "wx/thread.h"
54 #endif
55
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
58
59 #ifdef __DARWIN__
60 #include <CoreFoundation/CFString.h>
61 #include <CoreFoundation/CFStringEncodingExt.h>
62
63 #include "wx/mac/corefoundation/cfref.h"
64 #endif //def __DARWIN__
65
66 #ifdef __WXMAC__
67 #ifndef __DARWIN__
68 #include <ATSUnicode.h>
69 #include <TextCommon.h>
70 #include <TextEncodingConverter.h>
71 #endif
72
73 // includes Mac headers
74 #include "wx/mac/private.h"
75 #endif
76
77
78 #define TRACE_STRCONV _T("strconv")
79
80 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
81 // be 4 bytes
82 #if SIZEOF_WCHAR_T == 2
83 #define WC_UTF16
84 #endif
85
86
87 // ============================================================================
88 // implementation
89 // ============================================================================
90
// Scan the n bytes starting at p and report whether at least one of them is
// not NUL: returns false only if the whole range consists of NUL bytes (which
// is trivially the case for n == 0). Used to detect multi-byte terminators.
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
99
100 // ----------------------------------------------------------------------------
101 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
102 // ----------------------------------------------------------------------------
103
104 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
105 {
106 if (input <= 0xffff)
107 {
108 if (output)
109 *output = (wxUint16) input;
110
111 return 1;
112 }
113 else if (input >= 0x110000)
114 {
115 return wxCONV_FAILED;
116 }
117 else
118 {
119 if (output)
120 {
121 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
122 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
123 }
124
125 return 2;
126 }
127 }
128
129 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
130 {
131 if ((*input < 0xd800) || (*input > 0xdfff))
132 {
133 output = *input;
134 return 1;
135 }
136 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
137 {
138 output = *input;
139 return wxCONV_FAILED;
140 }
141 else
142 {
143 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
144 return 2;
145 }
146 }
147
148 #ifdef WC_UTF16
149 typedef wchar_t wxDecodeSurrogate_t;
150 #else // !WC_UTF16
151 typedef wxUint16 wxDecodeSurrogate_t;
152 #endif // WC_UTF16/!WC_UTF16
153
154 // returns the next UTF-32 character from the wchar_t buffer and advances the
155 // pointer to the character after this one
156 //
157 // if an invalid character is found, *pSrc is set to NULL, the caller must
158 // check for this
159 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
160 {
161 wxUint32 out;
162 const size_t
163 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
164 if ( n == wxCONV_FAILED )
165 *pSrc = NULL;
166 else
167 *pSrc += n;
168
169 return out;
170 }
171
172 // ----------------------------------------------------------------------------
173 // wxMBConv
174 // ----------------------------------------------------------------------------
175
176 size_t
177 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
178 const char *src, size_t srcLen) const
179 {
180 // although new conversion classes are supposed to implement this function
181 // directly, the existins ones only implement the old MB2WC() and so, to
182 // avoid to have to rewrite all conversion classes at once, we provide a
183 // default (but not efficient) implementation of this one in terms of the
184 // old function by copying the input to ensure that it's NUL-terminated and
185 // then using MB2WC() to convert it
186
187 // the number of chars [which would be] written to dst [if it were not NULL]
188 size_t dstWritten = 0;
189
190 // the number of NULs terminating this string
191 size_t nulLen = 0; // not really needed, but just to avoid warnings
192
193 // if we were not given the input size we just have to assume that the
194 // string is properly terminated as we have no way of knowing how long it
195 // is anyhow, but if we do have the size check whether there are enough
196 // NULs at the end
197 wxCharBuffer bufTmp;
198 const char *srcEnd;
199 if ( srcLen != wxNO_LEN )
200 {
201 // we need to know how to find the end of this string
202 nulLen = GetMBNulLen();
203 if ( nulLen == wxCONV_FAILED )
204 return wxCONV_FAILED;
205
206 // if there are enough NULs we can avoid the copy
207 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
208 {
209 // make a copy in order to properly NUL-terminate the string
210 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
211 char * const p = bufTmp.data();
212 memcpy(p, src, srcLen);
213 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
214 *s = '\0';
215
216 src = bufTmp;
217 }
218
219 srcEnd = src + srcLen;
220 }
221 else // quit after the first loop iteration
222 {
223 srcEnd = NULL;
224 }
225
226 for ( ;; )
227 {
228 // try to convert the current chunk
229 size_t lenChunk = MB2WC(NULL, src, 0);
230 if ( lenChunk == wxCONV_FAILED )
231 return wxCONV_FAILED;
232
233 lenChunk++; // for the L'\0' at the end of this chunk
234
235 dstWritten += lenChunk;
236
237 if ( lenChunk == 1 )
238 {
239 // nothing left in the input string, conversion succeeded
240 break;
241 }
242
243 if ( dst )
244 {
245 if ( dstWritten > dstLen )
246 return wxCONV_FAILED;
247
248 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
249 return wxCONV_FAILED;
250
251 dst += lenChunk;
252 }
253
254 if ( !srcEnd )
255 {
256 // we convert just one chunk in this case as this is the entire
257 // string anyhow
258 break;
259 }
260
261 // advance the input pointer past the end of this chunk
262 while ( NotAllNULs(src, nulLen) )
263 {
264 // notice that we must skip over multiple bytes here as we suppose
265 // that if NUL takes 2 or 4 bytes, then all the other characters do
266 // too and so if advanced by a single byte we might erroneously
267 // detect sequences of NUL bytes in the middle of the input
268 src += nulLen;
269 }
270
271 src += nulLen; // skipping over its terminator as well
272
273 // note that ">=" (and not just "==") is needed here as the terminator
274 // we skipped just above could be inside or just after the buffer
275 // delimited by inEnd
276 if ( src >= srcEnd )
277 break;
278 }
279
280 return dstWritten;
281 }
282
283 size_t
284 wxMBConv::FromWChar(char *dst, size_t dstLen,
285 const wchar_t *src, size_t srcLen) const
286 {
287 // the number of chars [which would be] written to dst [if it were not NULL]
288 size_t dstWritten = 0;
289
290 // make a copy of the input string unless it is already properly
291 // NUL-terminated
292 //
293 // if we don't know its length we have no choice but to assume that it is,
294 // indeed, properly terminated
295 wxWCharBuffer bufTmp;
296 if ( srcLen == wxNO_LEN )
297 {
298 srcLen = wxWcslen(src) + 1;
299 }
300 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
301 {
302 // make a copy in order to properly NUL-terminate the string
303 bufTmp = wxWCharBuffer(srcLen);
304 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
305 src = bufTmp;
306 }
307
308 const size_t lenNul = GetMBNulLen();
309 for ( const wchar_t * const srcEnd = src + srcLen;
310 src < srcEnd;
311 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
312 {
313 // try to convert the current chunk
314 size_t lenChunk = WC2MB(NULL, src, 0);
315
316 if ( lenChunk == wxCONV_FAILED )
317 return wxCONV_FAILED;
318
319 lenChunk += lenNul;
320 dstWritten += lenChunk;
321
322 if ( dst )
323 {
324 if ( dstWritten > dstLen )
325 return wxCONV_FAILED;
326
327 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
328 return wxCONV_FAILED;
329
330 dst += lenChunk;
331 }
332 }
333
334 return dstWritten;
335 }
336
337 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
338 {
339 size_t rc = ToWChar(outBuff, outLen, inBuff);
340 if ( rc != wxCONV_FAILED )
341 {
342 // ToWChar() returns the buffer length, i.e. including the trailing
343 // NUL, while this method doesn't take it into account
344 rc--;
345 }
346
347 return rc;
348 }
349
350 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
351 {
352 size_t rc = FromWChar(outBuff, outLen, inBuff);
353 if ( rc != wxCONV_FAILED )
354 {
355 rc -= GetMBNulLen();
356 }
357
358 return rc;
359 }
360
// Out-of-line destructor: it has no cleanup of its own to perform.
361 wxMBConv::~wxMBConv()
362 {
363 // nothing to do here (necessary for Darwin linking probably)
364 }
365
366 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
367 {
368 if ( psz )
369 {
370 // calculate the length of the buffer needed first
371 const size_t nLen = MB2WC(NULL, psz, 0);
372 if ( nLen != wxCONV_FAILED )
373 {
374 // now do the actual conversion
375 wxWCharBuffer buf(nLen /* +1 added implicitly */);
376
377 // +1 for the trailing NULL
378 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
379 return buf;
380 }
381 }
382
383 return wxWCharBuffer();
384 }
385
386 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
387 {
388 if ( pwz )
389 {
390 const size_t nLen = WC2MB(NULL, pwz, 0);
391 if ( nLen != wxCONV_FAILED )
392 {
393 // extra space for trailing NUL(s)
394 static const size_t extraLen = GetMaxMBNulLen();
395
396 wxCharBuffer buf(nLen + extraLen - 1);
397 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
398 return buf;
399 }
400 }
401
402 return wxCharBuffer();
403 }
404
405 const wxWCharBuffer
406 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
407 {
408 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
409 if ( dstLen != wxCONV_FAILED )
410 {
411 wxWCharBuffer wbuf(dstLen - 1);
412 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
413 {
414 if ( outLen )
415 {
416 *outLen = dstLen;
417 if ( wbuf[dstLen - 1] == L'\0' )
418 (*outLen)--;
419 }
420
421 return wbuf;
422 }
423 }
424
425 if ( outLen )
426 *outLen = 0;
427
428 return wxWCharBuffer();
429 }
430
431 const wxCharBuffer
432 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
433 {
434 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
435 if ( dstLen != wxCONV_FAILED )
436 {
437 // special case of empty input: can't allocate 0 size buffer below as
438 // wxCharBuffer insists on NUL-terminating it
439 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
440 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
441 {
442 if ( outLen )
443 {
444 *outLen = dstLen;
445
446 const size_t nulLen = GetMBNulLen();
447 if ( dstLen >= nulLen &&
448 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
449 {
450 // in this case the output is NUL-terminated and we're not
451 // supposed to count NUL
452 *outLen -= nulLen;
453 }
454 }
455
456 return buf;
457 }
458 }
459
460 if ( outLen )
461 *outLen = 0;
462
463 return wxCharBuffer();
464 }
465
466 // ----------------------------------------------------------------------------
467 // wxMBConvLibc
468 // ----------------------------------------------------------------------------
469
470 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
471 {
472 return wxMB2WC(buf, psz, n);
473 }
474
475 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
476 {
477 return wxWC2MB(buf, psz, n);
478 }
479
480 // ----------------------------------------------------------------------------
481 // wxConvBrokenFileNames
482 // ----------------------------------------------------------------------------
483
484 #ifdef __UNIX__
485
486 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
487 {
488 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
489 wxStricmp(charset, _T("UTF8")) == 0 )
490 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
491 else
492 m_conv = new wxCSConv(charset);
493 }
494
495 #endif // __UNIX__
496
497 // ----------------------------------------------------------------------------
498 // UTF-7
499 // ----------------------------------------------------------------------------
500
501 // Implementation (C) 2004 Fredrik Roubert
502
503 //
504 // BASE64 decoding table
505 //
// Maps each of the 256 possible byte values to its 6 bit BASE64 value:
// 'A'-'Z' give 0x00-0x19, 'a'-'z' 0x1a-0x33, '0'-'9' 0x34-0x3d, '+' 0x3e and
// '/' 0x3f. All the other bytes map to 0xff, which the decoder uses as the
// "not a BASE64 character" marker ending an encoded run.
506 static const unsigned char utf7unb64[] =
507 {
508 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
514 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
515 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
517 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
518 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
519 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
521 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
522 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
523 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
532 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
533 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
534 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
535 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
536 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
537 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
538 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
539 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
540 };
541
// Decode the NUL-terminated UTF-7 string psz into wide characters.
//
// buf may be NULL, in which case nothing is stored and only the length is
// computed. Otherwise n is the output limit checked before each top level
// sequence and a terminating NUL is appended only if there is room for it.
//
// Returns the number of wide characters produced, not counting the NUL, or
// wxCONV_FAILED if a '+' is followed neither by '-' nor by enough BASE64
// characters to yield at least one byte.
//
// NOTE(review): the BASE64 loop below stores characters without comparing len
// against n, the limit is only checked between the top level sequences.
542 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
543 {
544 size_t len = 0;
545
546 while ( *psz && (!buf || (len < n)) )
547 {
548 unsigned char cc = *psz++;
549 if (cc != '+')
550 {
551 // plain ASCII char
552 if (buf)
553 *buf++ = cc;
554 len++;
555 }
556 else if (*psz == '-')
557 {
558 // encoded plus sign
559 if (buf)
560 *buf++ = cc;
561 len++;
562 psz++;
563 }
564 else // start of BASE64 encoded string
565 {
// d accumulates the decoded bits, l is the number of bits in it which
// have not been consumed yet; lsb tells whether the next byte extracted
// is the low (true) or the high (false) half of a 16 bit unit and ok
// becomes true as soon as at least one byte has been extracted
566 bool lsb, ok;
567 unsigned int d, l;
568 for ( ok = lsb = false, d = 0, l = 0;
569 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
570 psz++ )
571 {
572 d <<= 6;
573 d += cc;
// extract every complete byte available, toggling lsb after each one
574 for (l += 6; l >= 8; lsb = !lsb)
575 {
576 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
577 if (lsb)
578 {
// low byte: complete the unit started by the high byte
579 if (buf)
580 *buf++ |= c;
581 len ++;
582 }
583 else
584 {
// high byte: start a new unit without advancing yet
585 if (buf)
586 *buf = (wchar_t)(c << 8);
587 }
588
589 ok = true;
590 }
591 }
592
593 if ( !ok )
594 {
595 // in valid UTF7 we should have valid characters after '+'
596 return wxCONV_FAILED;
597 }
598
// the '-' explicitly terminating the encoded run is consumed, if present
599 if (*psz == '-')
600 psz++;
601 }
602 }
603
604 if ( buf && (len < n) )
605 *buf = '\0';
606
607 return len;
608 }
609
610 //
611 // BASE64 encoding table
612 //
// Maps a 6 bit value (0..63) to the corresponding BASE64 character, this is
// the inverse of the utf7unb64 decoding table.
613 static const unsigned char utf7enb64[] =
614 {
615 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
616 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
617 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
618 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
619 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
620 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
621 'w', 'x', 'y', 'z', '0', '1', '2', '3',
622 '4', '5', '6', '7', '8', '9', '+', '/'
623 };
624
625 //
626 // UTF-7 encoding table
627 //
628 // 0 - Set D (directly encoded characters)
629 // 1 - Set O (optional direct characters)
630 // 2 - whitespace characters (optional)
631 // 3 - special characters
632 //
// Class (0..3, see the legend above) of each of the 128 ASCII characters,
// indexed by character code. The encoder only emits the characters of
// class 0 directly and BASE64-encodes all the others.
633 static const unsigned char utf7encode[128] =
634 {
635 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
636 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
637 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
638 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
639 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
640 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
641 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
642 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
643 };
644
// Encode the NUL-terminated wide string psz as UTF-7.
//
// buf may be NULL, in which case nothing is stored and only the length is
// computed. Otherwise n is the output limit checked before each top level
// sequence and a terminating NUL is appended only if there is room for it.
//
// Returns the number of bytes produced, not counting the NUL, or
// wxCONV_FAILED if, when WC_UTF16 is not defined, the sequence starts with a
// character above 0xffff.
//
// NOTE(review): as in MB2WC(), the BASE64 loop does not compare len with n.
645 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
646 {
647 size_t len = 0;
648
649 while (*psz && ((!buf) || (len < n)))
650 {
651 wchar_t cc = *psz++;
652 if (cc < 0x80 && utf7encode[cc] < 1)
653 {
654 // plain ASCII char
655 if (buf)
656 *buf++ = (char)cc;
657
658 len++;
659 }
660 #ifndef WC_UTF16
661 else if (((wxUint32)cc) > 0xffff)
662 {
663 // no surrogate pair generation (yet?)
664 return wxCONV_FAILED;
665 }
666 #endif
667 else
668 {
// every encoded run starts with '+'; a literal '+' becomes "+-"
669 if (buf)
670 *buf++ = '+';
671
672 len++;
673 if (cc != '+')
674 {
675 // BASE64 encode string
// d accumulates the bits to output and l counts those not yet emitted;
// each character contributes its high byte first and then its low one
676 unsigned int lsb, d, l;
677 for (d = 0, l = 0; /*nothing*/; psz++)
678 {
679 for (lsb = 0; lsb < 2; lsb ++)
680 {
681 d <<= 8;
682 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
683
684 for (l += 8; l >= 6; )
685 {
686 l -= 6;
687 if (buf)
688 *buf++ = utf7enb64[(d >> l) % 64];
689 len++;
690 }
691 }
692
// the run ends at the end of the string or at the first character which
// can be represented directly
693 cc = *psz;
694 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
695 break;
696 }
697
// flush the remaining bits, if any, in a final BASE64 character
698 if (l != 0)
699 {
700 if (buf)
701 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
702
703 len++;
704 }
705 }
706
// the encoded run is always explicitly terminated with '-'
707 if (buf)
708 *buf++ = '-';
709 len++;
710 }
711 }
712
713 if (buf && (len < n))
714 *buf = 0;
715
716 return len;
717 }
718
719 // ----------------------------------------------------------------------------
720 // UTF-8
721 // ----------------------------------------------------------------------------
722
// utf8_max[k] is the greatest value representable by a UTF-8 sequence using
// k continuation bytes (i.e. k + 1 bytes in total); the last element acts as
// a sentinel greater than or equal to any 32 bit value
723 static wxUint32 utf8_max[]=
724 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
725
726 // boundaries of the private use area we use to (temporarily) remap bytes
727 // which are invalid in a UTF-8 encoded string: [wxUnicodePUA, wxUnicodePUAEnd)
728 const wxUint32 wxUnicodePUA = 0x100000;
729 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
730
// Decode the NUL-terminated UTF-8 string psz into wide characters.
//
// buf may be NULL, in which case nothing is stored and only the length is
// computed. Otherwise n is the output limit checked before each input
// sequence and a terminating NUL is appended only if there is room for it.
//
// Invalid input is handled according to m_options: each byte of the invalid
// sequence is either mapped to wxUnicodePUA + byte (MAP_INVALID_UTF8_TO_PUA),
// written as a backslash followed by 3 octal digits
// (MAP_INVALID_UTF8_TO_OCTAL, in which case valid backslashes are doubled) or
// makes the function return wxCONV_FAILED.
//
// Returns the number of wide characters produced, not counting the NUL.
731 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
732 {
733 size_t len = 0;
734
735 while (*psz && ((!buf) || (len < n)))
736 {
// opsz remembers the start of the sequence so that its bytes can be
// remapped one by one if it turns out to be invalid
737 const char *opsz = psz;
738 bool invalid = false;
739 unsigned char cc = *psz++, fc = cc;
// count the leading 1 bits of the first byte
740 unsigned cnt;
741 for (cnt = 0; fc & 0x80; cnt++)
742 fc <<= 1;
743
744 if (!cnt)
745 {
746 // plain ASCII char
747 if (buf)
748 *buf++ = cc;
749 len++;
750
751 // escape the escape character for octal escapes
752 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
753 && cc == '\\' && (!buf || len < n))
754 {
755 if (buf)
756 *buf++ = cc;
757 len++;
758 }
759 }
760 else
761 {
// cnt is now the number of continuation bytes expected; 0 means that the
// first byte was itself a continuation byte (10xxxxxx)
762 cnt--;
763 if (!cnt)
764 {
765 // invalid UTF-8 sequence
766 invalid = true;
767 }
768 else
769 {
770 unsigned ocnt = cnt - 1;
771 wxUint32 res = cc & (0x3f >> cnt);
772 while (cnt--)
773 {
774 cc = *psz;
775 if ((cc & 0xC0) != 0x80)
776 {
777 // invalid UTF-8 sequence
778 invalid = true;
779 break;
780 }
781
782 psz++;
783 res = (res << 6) | (cc & 0x3f);
784 }
785
// a value which would have fitted in a shorter sequence is an overlong
// encoding and is rejected as well
786 if (invalid || res <= utf8_max[ocnt])
787 {
788 // illegal UTF-8 encoding
789 invalid = true;
790 }
791 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
792 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
793 {
794 // if one of our PUA characters turns up externally
795 // it must also be treated as an illegal sequence
796 // (a bit like you have to escape an escape character)
797 invalid = true;
798 }
799 else
800 {
801 #ifdef WC_UTF16
802 // cast is ok because wchar_t == wxUint16 if WC_UTF16
803 size_t pa = encode_utf16(res, (wxUint16 *)buf);
804 if (pa == wxCONV_FAILED)
805 {
806 invalid = true;
807 }
808 else
809 {
810 if (buf)
811 buf += pa;
812 len += pa;
813 }
814 #else // !WC_UTF16
815 if (buf)
816 *buf++ = (wchar_t)res;
817 len++;
818 #endif // WC_UTF16/!WC_UTF16
819 }
820 }
821
822 if (invalid)
823 {
824 if (m_options & MAP_INVALID_UTF8_TO_PUA)
825 {
// map every byte consumed so far to the corresponding PUA character
826 while (opsz < psz && (!buf || len < n))
827 {
828 #ifdef WC_UTF16
829 // cast is ok because wchar_t == wxUint16 if WC_UTF16
830 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
831 wxASSERT(pa != wxCONV_FAILED);
832 if (buf)
833 buf += pa;
834 opsz++;
835 len += pa;
836 #else
837 if (buf)
838 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
839 opsz++;
840 len++;
841 #endif
842 }
843 }
844 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
845 {
846 while (opsz < psz && (!buf || len < n))
847 {
// the 4 characters of the escape are only stored if all of them fit, but
// len is increased by 4 in any case
848 if ( buf && len + 3 < n )
849 {
850 unsigned char on = *opsz;
851 *buf++ = L'\\';
852 *buf++ = (wchar_t)( L'0' + on / 0100 );
853 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
854 *buf++ = (wchar_t)( L'0' + on % 010 );
855 }
856
857 opsz++;
858 len += 4;
859 }
860 }
861 else // MAP_INVALID_UTF8_NOT
862 {
863 return wxCONV_FAILED;
864 }
865 }
866 }
867 }
868
869 if (buf && (len < n))
870 *buf = 0;
871
872 return len;
873 }
874
// Check whether the given wide character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
879
// Encode the NUL-terminated wide string psz as UTF-8.
//
// buf may be NULL, in which case nothing is stored and only the length is
// computed. Otherwise n is the output limit checked before each input
// character and a terminating NUL is appended only if there is room for it.
//
// Depending on m_options, the mappings applied by MB2WC() to invalid input
// are undone: characters in [wxUnicodePUA, wxUnicodePUAEnd) are turned back
// into single bytes (MAP_INVALID_UTF8_TO_PUA), while doubled backslashes and
// backslash + 3 octal digits escapes are turned back into a single byte
// (MAP_INVALID_UTF8_TO_OCTAL).
//
// Returns the number of bytes produced, not counting the NUL.
880 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
881 {
882 size_t len = 0;
883
884 while (*psz && ((!buf) || (len < n)))
885 {
886 wxUint32 cc;
887
888 #ifdef WC_UTF16
889 // cast is ok for WC_UTF16
// if the surrogate decoding fails, only one unit is skipped and cc, set
// by decode_utf16() to this unit, is still encoded below
890 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
891 psz += (pa == wxCONV_FAILED) ? 1 : pa;
892 #else
893 cc = (*psz++) & 0x7fffffff;
894 #endif
895
896 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
897 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
898 {
899 if (buf)
900 *buf++ = (char)(cc - wxUnicodePUA);
901 len++;
902 }
903 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
904 && cc == L'\\' && psz[0] == L'\\' )
905 {
906 if (buf)
907 *buf++ = (char)cc;
908 psz++;
909 len++;
910 }
911 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
912 cc == L'\\' &&
913 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
914 {
915 if (buf)
916 {
917 *buf++ = (char) ((psz[0] - L'0') * 0100 +
918 (psz[1] - L'0') * 010 +
919 (psz[2] - L'0'));
920 }
921
922 psz += 3;
923 len++;
924 }
925 else
926 {
// find the number of continuation bytes needed for this value
927 unsigned cnt;
928 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
929 {
930 }
931
932 if (!cnt)
933 {
934 // plain ASCII char
935 if (buf)
936 *buf++ = (char) cc;
937 len++;
938 }
939 else
940 {
941 len += cnt + 1;
942 if (buf)
943 {
// lead byte: length marker bits (obtained by shifting -128 right, which
// presumes an arithmetic shift) combined with the most significant bits
// of the value, followed by cnt continuation bytes of 6 bits each
944 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
945 while (cnt--)
946 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
947 }
948 }
949 }
950 }
951
952 if (buf && (len < n))
953 *buf = 0;
954
955 return len;
956 }
957
958 // ============================================================================
959 // UTF-16
960 // ============================================================================
961
// The conversions below are implemented by "straight" (no byte swapping) and
// "swap" (bytes of each unit swapped) classes: map these names to the BE and
// LE classes depending on whether WORDS_BIGENDIAN is defined.
962 #ifdef WORDS_BIGENDIAN
963 #define wxMBConvUTF16straight wxMBConvUTF16BE
964 #define wxMBConvUTF16swap wxMBConvUTF16LE
965 #else
966 #define wxMBConvUTF16swap wxMBConvUTF16BE
967 #define wxMBConvUTF16straight wxMBConvUTF16LE
968 #endif
969
970 /* static */
971 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
972 {
973 if ( srcLen == wxNO_LEN )
974 {
975 // count the number of bytes in input, including the trailing NULs
976 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
977 for ( srcLen = 1; *inBuff++; srcLen++ )
978 ;
979
980 srcLen *= BYTES_PER_CHAR;
981 }
982 else // we already have the length
983 {
984 // we can only convert an entire number of UTF-16 characters
985 if ( srcLen % BYTES_PER_CHAR )
986 return wxCONV_FAILED;
987 }
988
989 return srcLen;
990 }
991
992 // case when in-memory representation is UTF-16 too
993 #ifdef WC_UTF16
994
995 // ----------------------------------------------------------------------------
996 // conversions without endianness change
997 // ----------------------------------------------------------------------------
998
999 size_t
1000 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1001 const char *src, size_t srcLen) const
1002 {
1003 // set up the scene for using memcpy() (which is presumably more efficient
1004 // than copying the bytes one by one)
1005 srcLen = GetLength(src, srcLen);
1006 if ( srcLen == wxNO_LEN )
1007 return wxCONV_FAILED;
1008
1009 const size_t inLen = srcLen / BYTES_PER_CHAR;
1010 if ( dst )
1011 {
1012 if ( dstLen < inLen )
1013 return wxCONV_FAILED;
1014
1015 memcpy(dst, src, srcLen);
1016 }
1017
1018 return inLen;
1019 }
1020
1021 size_t
1022 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1023 const wchar_t *src, size_t srcLen) const
1024 {
1025 if ( srcLen == wxNO_LEN )
1026 srcLen = wxWcslen(src) + 1;
1027
1028 srcLen *= BYTES_PER_CHAR;
1029
1030 if ( dst )
1031 {
1032 if ( dstLen < srcLen )
1033 return wxCONV_FAILED;
1034
1035 memcpy(dst, src, srcLen);
1036 }
1037
1038 return srcLen;
1039 }
1040
1041 // ----------------------------------------------------------------------------
1042 // endian-reversing conversions
1043 // ----------------------------------------------------------------------------
1044
1045 size_t
1046 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1047 const char *src, size_t srcLen) const
1048 {
1049 srcLen = GetLength(src, srcLen);
1050 if ( srcLen == wxNO_LEN )
1051 return wxCONV_FAILED;
1052
1053 srcLen /= BYTES_PER_CHAR;
1054
1055 if ( dst )
1056 {
1057 if ( dstLen < srcLen )
1058 return wxCONV_FAILED;
1059
1060 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1061 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1062 {
1063 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1064 }
1065 }
1066
1067 return srcLen;
1068 }
1069
1070 size_t
1071 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1072 const wchar_t *src, size_t srcLen) const
1073 {
1074 if ( srcLen == wxNO_LEN )
1075 srcLen = wxWcslen(src) + 1;
1076
1077 srcLen *= BYTES_PER_CHAR;
1078
1079 if ( dst )
1080 {
1081 if ( dstLen < srcLen )
1082 return wxCONV_FAILED;
1083
1084 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1085 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1086 {
1087 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1088 }
1089 }
1090
1091 return srcLen;
1092 }
1093
1094 #else // !WC_UTF16: wchar_t is UTF-32
1095
1096 // ----------------------------------------------------------------------------
1097 // conversions without endianness change
1098 // ----------------------------------------------------------------------------
1099
1100 size_t
1101 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1102 const char *src, size_t srcLen) const
1103 {
1104 srcLen = GetLength(src, srcLen);
1105 if ( srcLen == wxNO_LEN )
1106 return wxCONV_FAILED;
1107
1108 const size_t inLen = srcLen / BYTES_PER_CHAR;
1109 if ( !dst )
1110 {
1111 // optimization: return maximal space which could be needed for this
1112 // string even if the real size could be smaller if the buffer contains
1113 // any surrogates
1114 return inLen;
1115 }
1116
1117 size_t outLen = 0;
1118 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1119 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1120 {
1121 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1122 if ( !inBuff )
1123 return wxCONV_FAILED;
1124
1125 if ( ++outLen > dstLen )
1126 return wxCONV_FAILED;
1127
1128 *dst++ = ch;
1129 }
1130
1131
1132 return outLen;
1133 }
1134
1135 size_t
1136 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1137 const wchar_t *src, size_t srcLen) const
1138 {
1139 if ( srcLen == wxNO_LEN )
1140 srcLen = wxWcslen(src) + 1;
1141
1142 size_t outLen = 0;
1143 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1144 for ( size_t n = 0; n < srcLen; n++ )
1145 {
1146 wxUint16 cc[2];
1147 const size_t numChars = encode_utf16(*src++, cc);
1148 if ( numChars == wxCONV_FAILED )
1149 return wxCONV_FAILED;
1150
1151 outLen += numChars * BYTES_PER_CHAR;
1152 if ( outBuff )
1153 {
1154 if ( outLen > dstLen )
1155 return wxCONV_FAILED;
1156
1157 *outBuff++ = cc[0];
1158 if ( numChars == 2 )
1159 {
1160 // second character of a surrogate
1161 *outBuff++ = cc[1];
1162 }
1163 }
1164 }
1165
1166 return outLen;
1167 }
1168
1169 // ----------------------------------------------------------------------------
1170 // endian-reversing conversions
1171 // ----------------------------------------------------------------------------
1172
1173 size_t
1174 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1175 const char *src, size_t srcLen) const
1176 {
1177 srcLen = GetLength(src, srcLen);
1178 if ( srcLen == wxNO_LEN )
1179 return wxCONV_FAILED;
1180
1181 const size_t inLen = srcLen / BYTES_PER_CHAR;
1182 if ( !dst )
1183 {
1184 // optimization: return maximal space which could be needed for this
1185 // string even if the real size could be smaller if the buffer contains
1186 // any surrogates
1187 return inLen;
1188 }
1189
1190 size_t outLen = 0;
1191 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1192 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1193 {
1194 wxUint32 ch;
1195 wxUint16 tmp[2];
1196
1197 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1198 inBuff++;
1199 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1200
1201 const size_t numChars = decode_utf16(tmp, ch);
1202 if ( numChars == wxCONV_FAILED )
1203 return wxCONV_FAILED;
1204
1205 if ( numChars == 2 )
1206 inBuff++;
1207
1208 if ( ++outLen > dstLen )
1209 return wxCONV_FAILED;
1210
1211 *dst++ = ch;
1212 }
1213
1214
1215 return outLen;
1216 }
1217
1218 size_t
1219 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1220 const wchar_t *src, size_t srcLen) const
1221 {
1222 if ( srcLen == wxNO_LEN )
1223 srcLen = wxWcslen(src) + 1;
1224
1225 size_t outLen = 0;
1226 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1227 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1228 {
1229 wxUint16 cc[2];
1230 const size_t numChars = encode_utf16(*src, cc);
1231 if ( numChars == wxCONV_FAILED )
1232 return wxCONV_FAILED;
1233
1234 outLen += numChars * BYTES_PER_CHAR;
1235 if ( outBuff )
1236 {
1237 if ( outLen > dstLen )
1238 return wxCONV_FAILED;
1239
1240 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1241 if ( numChars == 2 )
1242 {
1243 // second character of a surrogate
1244 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1245 }
1246 }
1247 }
1248
1249 return outLen;
1250 }
1251
1252 #endif // WC_UTF16/!WC_UTF16
1253
1254
1255 // ============================================================================
1256 // UTF-32
1257 // ============================================================================
1258
// As for UTF-16 above, the UTF-32 conversions are implemented by "straight"
// and "swap" classes which are mapped to the BE and LE classes depending on
// whether WORDS_BIGENDIAN is defined.
1259 #ifdef WORDS_BIGENDIAN
1260 #define wxMBConvUTF32straight wxMBConvUTF32BE
1261 #define wxMBConvUTF32swap wxMBConvUTF32LE
1262 #else
1263 #define wxMBConvUTF32swap wxMBConvUTF32BE
1264 #define wxMBConvUTF32straight wxMBConvUTF32LE
1265 #endif
1266
1267
// global UTF-32 converter objects, one for each byte order
// NOTE(review): whether these lines define or merely declare the objects
// depends on the expansion of WXDLLIMPEXP_DATA_BASE -- confirm
1268 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1269 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1270
1271 /* static */
1272 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1273 {
1274 if ( srcLen == wxNO_LEN )
1275 {
1276 // count the number of bytes in input, including the trailing NULs
1277 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1278 for ( srcLen = 1; *inBuff++; srcLen++ )
1279 ;
1280
1281 srcLen *= BYTES_PER_CHAR;
1282 }
1283 else // we already have the length
1284 {
1285 // we can only convert an entire number of UTF-32 characters
1286 if ( srcLen % BYTES_PER_CHAR )
1287 return wxCONV_FAILED;
1288 }
1289
1290 return srcLen;
1291 }
1292
1293 // case when in-memory representation is UTF-16
1294 #ifdef WC_UTF16
1295
1296 // ----------------------------------------------------------------------------
1297 // conversions without endianness change
1298 // ----------------------------------------------------------------------------
1299
1300 size_t
1301 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1302 const char *src, size_t srcLen) const
1303 {
1304 srcLen = GetLength(src, srcLen);
1305 if ( srcLen == wxNO_LEN )
1306 return wxCONV_FAILED;
1307
1308 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1309 const size_t inLen = srcLen / BYTES_PER_CHAR;
1310 size_t outLen = 0;
1311 for ( size_t n = 0; n < inLen; n++ )
1312 {
1313 wxUint16 cc[2];
1314 const size_t numChars = encode_utf16(*inBuff++, cc);
1315 if ( numChars == wxCONV_FAILED )
1316 return wxCONV_FAILED;
1317
1318 outLen += numChars;
1319 if ( dst )
1320 {
1321 if ( outLen > dstLen )
1322 return wxCONV_FAILED;
1323
1324 *dst++ = cc[0];
1325 if ( numChars == 2 )
1326 {
1327 // second character of a surrogate
1328 *dst++ = cc[1];
1329 }
1330 }
1331 }
1332
1333 return outLen;
1334 }
1335
1336 size_t
1337 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1338 const wchar_t *src, size_t srcLen) const
1339 {
1340 if ( srcLen == wxNO_LEN )
1341 srcLen = wxWcslen(src) + 1;
1342
1343 if ( !dst )
1344 {
1345 // optimization: return maximal space which could be needed for this
1346 // string instead of the exact amount which could be less if there are
1347 // any surrogates in the input
1348 //
1349 // we consider that surrogates are rare enough to make it worthwhile to
1350 // avoid running the loop below at the cost of slightly extra memory
1351 // consumption
1352 return srcLen * BYTES_PER_CHAR;
1353 }
1354
1355 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1356 size_t outLen = 0;
1357 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1358 {
1359 const wxUint32 ch = wxDecodeSurrogate(&src);
1360 if ( !src )
1361 return wxCONV_FAILED;
1362
1363 outLen += BYTES_PER_CHAR;
1364
1365 if ( outLen > dstLen )
1366 return wxCONV_FAILED;
1367
1368 *outBuff++ = ch;
1369 }
1370
1371 return outLen;
1372 }
1373
1374 // ----------------------------------------------------------------------------
1375 // endian-reversing conversions
1376 // ----------------------------------------------------------------------------
1377
1378 size_t
1379 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1380 const char *src, size_t srcLen) const
1381 {
1382 srcLen = GetLength(src, srcLen);
1383 if ( srcLen == wxNO_LEN )
1384 return wxCONV_FAILED;
1385
1386 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1387 const size_t inLen = srcLen / BYTES_PER_CHAR;
1388 size_t outLen = 0;
1389 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1390 {
1391 wxUint16 cc[2];
1392 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1393 if ( numChars == wxCONV_FAILED )
1394 return wxCONV_FAILED;
1395
1396 outLen += numChars;
1397 if ( dst )
1398 {
1399 if ( outLen > dstLen )
1400 return wxCONV_FAILED;
1401
1402 *dst++ = cc[0];
1403 if ( numChars == 2 )
1404 {
1405 // second character of a surrogate
1406 *dst++ = cc[1];
1407 }
1408 }
1409 }
1410
1411 return outLen;
1412 }
1413
1414 size_t
1415 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1416 const wchar_t *src, size_t srcLen) const
1417 {
1418 if ( srcLen == wxNO_LEN )
1419 srcLen = wxWcslen(src) + 1;
1420
1421 if ( !dst )
1422 {
1423 // optimization: return maximal space which could be needed for this
1424 // string instead of the exact amount which could be less if there are
1425 // any surrogates in the input
1426 //
1427 // we consider that surrogates are rare enough to make it worthwhile to
1428 // avoid running the loop below at the cost of slightly extra memory
1429 // consumption
1430 return srcLen*BYTES_PER_CHAR;
1431 }
1432
1433 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1434 size_t outLen = 0;
1435 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1436 {
1437 const wxUint32 ch = wxDecodeSurrogate(&src);
1438 if ( !src )
1439 return wxCONV_FAILED;
1440
1441 outLen += BYTES_PER_CHAR;
1442
1443 if ( outLen > dstLen )
1444 return wxCONV_FAILED;
1445
1446 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1447 }
1448
1449 return outLen;
1450 }
1451
1452 #else // !WC_UTF16: wchar_t is UTF-32
1453
1454 // ----------------------------------------------------------------------------
1455 // conversions without endianness change
1456 // ----------------------------------------------------------------------------
1457
1458 size_t
1459 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1460 const char *src, size_t srcLen) const
1461 {
1462 // use memcpy() as it should be much faster than hand-written loop
1463 srcLen = GetLength(src, srcLen);
1464 if ( srcLen == wxNO_LEN )
1465 return wxCONV_FAILED;
1466
1467 const size_t inLen = srcLen/BYTES_PER_CHAR;
1468 if ( dst )
1469 {
1470 if ( dstLen < inLen )
1471 return wxCONV_FAILED;
1472
1473 memcpy(dst, src, srcLen);
1474 }
1475
1476 return inLen;
1477 }
1478
1479 size_t
1480 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1481 const wchar_t *src, size_t srcLen) const
1482 {
1483 if ( srcLen == wxNO_LEN )
1484 srcLen = wxWcslen(src) + 1;
1485
1486 srcLen *= BYTES_PER_CHAR;
1487
1488 if ( dst )
1489 {
1490 if ( dstLen < srcLen )
1491 return wxCONV_FAILED;
1492
1493 memcpy(dst, src, srcLen);
1494 }
1495
1496 return srcLen;
1497 }
1498
1499 // ----------------------------------------------------------------------------
1500 // endian-reversing conversions
1501 // ----------------------------------------------------------------------------
1502
1503 size_t
1504 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1505 const char *src, size_t srcLen) const
1506 {
1507 srcLen = GetLength(src, srcLen);
1508 if ( srcLen == wxNO_LEN )
1509 return wxCONV_FAILED;
1510
1511 srcLen /= BYTES_PER_CHAR;
1512
1513 if ( dst )
1514 {
1515 if ( dstLen < srcLen )
1516 return wxCONV_FAILED;
1517
1518 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1519 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1520 {
1521 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1522 }
1523 }
1524
1525 return srcLen;
1526 }
1527
1528 size_t
1529 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1530 const wchar_t *src, size_t srcLen) const
1531 {
1532 if ( srcLen == wxNO_LEN )
1533 srcLen = wxWcslen(src) + 1;
1534
1535 srcLen *= BYTES_PER_CHAR;
1536
1537 if ( dst )
1538 {
1539 if ( dstLen < srcLen )
1540 return wxCONV_FAILED;
1541
1542 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1543 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1544 {
1545 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1546 }
1547 }
1548
1549 return srcLen;
1550 }
1551
1552 #endif // WC_UTF16/!WC_UTF16
1553
1554
1555 // ============================================================================
1556 // The classes doing conversion using the iconv_xxx() functions
1557 // ============================================================================
1558
1559 #ifdef HAVE_ICONV
1560
1561 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1562 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1563 // (unless there's yet another bug in glibc) the only case when iconv()
1564 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1565 // left in the input buffer -- when _real_ error occurs,
1566 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1567 // iconv() failure.
1568 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() call which returned cres
// and left bufLeft bytes of input unconverted should be treated as failed.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    // for glibc up to 2.1, E2BIG with no input left is not a real error
    #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                        (errno != E2BIG || bufLeft != 0))
#else
    #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
#endif

// cast x to the type used for the input buffer argument of iconv() calls in
// this file
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// value which the iconv_t handles in this file are compared against to
// detect that they are not usable
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP swaps the bytes of a single wchar_t and WC_ENC is the font
// encoding used to look up iconv names for wchar_t; both depend on
// SIZEOF_WCHAR_T
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP wxUINT32_SWAP_ALWAYS
    #define WC_ENC wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP wxUINT16_SWAP_ALWAYS
    #define WC_ENC wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1590
1591 // ----------------------------------------------------------------------------
1592 // wxMBConv_iconv: encapsulates an iconv character set
1593 // ----------------------------------------------------------------------------
1594
1595 class wxMBConv_iconv : public wxMBConv
1596 {
1597 public:
1598 wxMBConv_iconv(const char *name);
1599 virtual ~wxMBConv_iconv();
1600
1601 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1602 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1603
1604 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1605 virtual size_t GetMBNulLen() const;
1606
1607 #if wxUSE_UNICODE_UTF8
1608 virtual bool IsUTF8() const;
1609 #endif
1610
1611 virtual wxMBConv *Clone() const
1612 {
1613 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1614 p->m_minMBCharWidth = m_minMBCharWidth;
1615 return p;
1616 }
1617
1618 bool IsOk() const
1619 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1620
1621 protected:
1622 // the iconv handlers used to translate from multibyte
1623 // to wide char and in the other direction
1624 iconv_t m2w,
1625 w2m;
1626
1627 #if wxUSE_THREADS
1628 // guards access to m2w and w2m objects
1629 wxMutex m_iconvMutex;
1630 #endif
1631
1632 private:
1633 // the name (for iconv_open()) of a wide char charset -- if none is
1634 // available on this machine, it will remain NULL
1635 static wxString ms_wcCharsetName;
1636
1637 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1638 // different endian-ness than the native one
1639 static bool ms_wcNeedsSwap;
1640
1641
1642 // name of the encoding handled by this conversion
1643 wxString m_name;
1644
1645 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1646 // initially
1647 size_t m_minMBCharWidth;
1648 };
1649
1650 // make the constructor available for unit testing
1651 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1652 {
1653 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1654 if ( !result->IsOk() )
1655 {
1656 delete result;
1657 return 0;
1658 }
1659
1660 return result;
1661 }
1662
// definitions of the static members: the wide char charset name starts out
// empty and is filled in by the constructor of the first wxMBConv_iconv
// object which finds a working one
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1665
1666 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1667 : m_name(name)
1668 {
1669 m_minMBCharWidth = 0;
1670
1671 // check for charset that represents wchar_t:
1672 if ( ms_wcCharsetName.empty() )
1673 {
1674 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1675
1676 #if wxUSE_FONTMAP
1677 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1678 #else // !wxUSE_FONTMAP
1679 static const wxChar *names_static[] =
1680 {
1681 #if SIZEOF_WCHAR_T == 4
1682 _T("UCS-4"),
1683 #elif SIZEOF_WCHAR_T = 2
1684 _T("UCS-2"),
1685 #endif
1686 NULL
1687 };
1688 const wxChar **names = names_static;
1689 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1690
1691 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1692 {
1693 const wxString nameCS(*names);
1694
1695 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1696 wxString nameXE(nameCS);
1697
1698 #ifdef WORDS_BIGENDIAN
1699 nameXE += _T("BE");
1700 #else // little endian
1701 nameXE += _T("LE");
1702 #endif
1703
1704 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1705 nameXE.c_str());
1706
1707 m2w = iconv_open(nameXE.ToAscii(), name);
1708 if ( m2w == ICONV_T_INVALID )
1709 {
1710 // try charset w/o bytesex info (e.g. "UCS4")
1711 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1712 nameCS.c_str());
1713 m2w = iconv_open(nameCS.ToAscii(), name);
1714
1715 // and check for bytesex ourselves:
1716 if ( m2w != ICONV_T_INVALID )
1717 {
1718 char buf[2], *bufPtr;
1719 wchar_t wbuf[2], *wbufPtr;
1720 size_t insz, outsz;
1721 size_t res;
1722
1723 buf[0] = 'A';
1724 buf[1] = 0;
1725 wbuf[0] = 0;
1726 insz = 2;
1727 outsz = SIZEOF_WCHAR_T * 2;
1728 wbufPtr = wbuf;
1729 bufPtr = buf;
1730
1731 res = iconv(
1732 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1733 (char**)&wbufPtr, &outsz);
1734
1735 if (ICONV_FAILED(res, insz))
1736 {
1737 wxLogLastError(wxT("iconv"));
1738 wxLogError(_("Conversion to charset '%s' doesn't work."),
1739 nameCS.c_str());
1740 }
1741 else // ok, can convert to this encoding, remember it
1742 {
1743 ms_wcCharsetName = nameCS;
1744 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1745 }
1746 }
1747 }
1748 else // use charset not requiring byte swapping
1749 {
1750 ms_wcCharsetName = nameXE;
1751 }
1752 }
1753
1754 wxLogTrace(TRACE_STRCONV,
1755 wxT("iconv wchar_t charset is \"%s\"%s"),
1756 ms_wcCharsetName.empty() ? wxString("<none>")
1757 : ms_wcCharsetName,
1758 ms_wcNeedsSwap ? _T(" (needs swap)")
1759 : _T(""));
1760 }
1761 else // we already have ms_wcCharsetName
1762 {
1763 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
1764 }
1765
1766 if ( ms_wcCharsetName.empty() )
1767 {
1768 w2m = ICONV_T_INVALID;
1769 }
1770 else
1771 {
1772 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
1773 if ( w2m == ICONV_T_INVALID )
1774 {
1775 wxLogTrace(TRACE_STRCONV,
1776 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1777 ms_wcCharsetName.c_str(), name);
1778 }
1779 }
1780 }
1781
1782 wxMBConv_iconv::~wxMBConv_iconv()
1783 {
1784 if ( m2w != ICONV_T_INVALID )
1785 iconv_close(m2w);
1786 if ( w2m != ICONV_T_INVALID )
1787 iconv_close(w2m);
1788 }
1789
1790 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1791 {
1792 // find the string length: notice that must be done differently for
1793 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1794 size_t inbuf;
1795 const size_t nulLen = GetMBNulLen();
1796 switch ( nulLen )
1797 {
1798 default:
1799 return wxCONV_FAILED;
1800
1801 case 1:
1802 inbuf = strlen(psz); // arguably more optimized than our version
1803 break;
1804
1805 case 2:
1806 case 4:
1807 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1808 // they also have to start at character boundary and not span two
1809 // adjacent characters
1810 const char *p;
1811 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1812 ;
1813 inbuf = p - psz;
1814 break;
1815 }
1816
1817 #if wxUSE_THREADS
1818 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1819 // Unfortunately there are a couple of global wxCSConv objects such as
1820 // wxConvLocal that are used all over wx code, so we have to make sure
1821 // the handle is used by at most one thread at the time. Otherwise
1822 // only a few wx classes would be safe to use from non-main threads
1823 // as MB<->WC conversion would fail "randomly".
1824 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1825 #endif // wxUSE_THREADS
1826
1827 size_t outbuf = n * SIZEOF_WCHAR_T;
1828 size_t res, cres;
1829 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1830 wchar_t *bufPtr = buf;
1831 const char *pszPtr = psz;
1832
1833 if (buf)
1834 {
1835 // have destination buffer, convert there
1836 cres = iconv(m2w,
1837 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1838 (char**)&bufPtr, &outbuf);
1839 res = n - (outbuf / SIZEOF_WCHAR_T);
1840
1841 if (ms_wcNeedsSwap)
1842 {
1843 // convert to native endianness
1844 for ( unsigned i = 0; i < res; i++ )
1845 buf[n] = WC_BSWAP(buf[i]);
1846 }
1847
1848 // NUL-terminate the string if there is any space left
1849 if (res < n)
1850 buf[res] = 0;
1851 }
1852 else
1853 {
1854 // no destination buffer... convert using temp buffer
1855 // to calculate destination buffer requirement
1856 wchar_t tbuf[8];
1857 res = 0;
1858
1859 do
1860 {
1861 bufPtr = tbuf;
1862 outbuf = 8 * SIZEOF_WCHAR_T;
1863
1864 cres = iconv(m2w,
1865 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1866 (char**)&bufPtr, &outbuf );
1867
1868 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1869 }
1870 while ((cres == (size_t)-1) && (errno == E2BIG));
1871 }
1872
1873 if (ICONV_FAILED(cres, inbuf))
1874 {
1875 //VS: it is ok if iconv fails, hence trace only
1876 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1877 return wxCONV_FAILED;
1878 }
1879
1880 return res;
1881 }
1882
1883 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1884 {
1885 #if wxUSE_THREADS
1886 // NB: explained in MB2WC
1887 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1888 #endif
1889
1890 size_t inlen = wxWcslen(psz);
1891 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1892 size_t outbuf = n;
1893 size_t res, cres;
1894
1895 wchar_t *tmpbuf = 0;
1896
1897 if (ms_wcNeedsSwap)
1898 {
1899 // need to copy to temp buffer to switch endianness
1900 // (doing WC_BSWAP twice on the original buffer won't help, as it
1901 // could be in read-only memory, or be accessed in some other thread)
1902 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1903 for ( size_t i = 0; i < inlen; i++ )
1904 tmpbuf[n] = WC_BSWAP(psz[i]);
1905
1906 tmpbuf[inlen] = L'\0';
1907 psz = tmpbuf;
1908 }
1909
1910 if (buf)
1911 {
1912 // have destination buffer, convert there
1913 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1914
1915 res = n - outbuf;
1916
1917 // NB: iconv was given only wcslen(psz) characters on input, and so
1918 // it couldn't convert the trailing zero. Let's do it ourselves
1919 // if there's some room left for it in the output buffer.
1920 if (res < n)
1921 buf[0] = 0;
1922 }
1923 else
1924 {
1925 // no destination buffer: convert using temp buffer
1926 // to calculate destination buffer requirement
1927 char tbuf[16];
1928 res = 0;
1929 do
1930 {
1931 buf = tbuf;
1932 outbuf = 16;
1933
1934 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1935
1936 res += 16 - outbuf;
1937 }
1938 while ((cres == (size_t)-1) && (errno == E2BIG));
1939 }
1940
1941 if (ms_wcNeedsSwap)
1942 {
1943 free(tmpbuf);
1944 }
1945
1946 if (ICONV_FAILED(cres, inbuf))
1947 {
1948 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1949 return wxCONV_FAILED;
1950 }
1951
1952 return res;
1953 }
1954
1955 size_t wxMBConv_iconv::GetMBNulLen() const
1956 {
1957 if ( m_minMBCharWidth == 0 )
1958 {
1959 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1960
1961 #if wxUSE_THREADS
1962 // NB: explained in MB2WC
1963 wxMutexLocker lock(self->m_iconvMutex);
1964 #endif
1965
1966 const wchar_t *wnul = L"";
1967 char buf[8]; // should be enough for NUL in any encoding
1968 size_t inLen = sizeof(wchar_t),
1969 outLen = WXSIZEOF(buf);
1970 char *inBuff = (char *)wnul;
1971 char *outBuff = buf;
1972 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1973 {
1974 self->m_minMBCharWidth = (size_t)-1;
1975 }
1976 else // ok
1977 {
1978 self->m_minMBCharWidth = outBuff - buf;
1979 }
1980 }
1981
1982 return m_minMBCharWidth;
1983 }
1984
1985 #if wxUSE_UNICODE_UTF8
1986 bool wxMBConv_iconv::IsUTF8() const
1987 {
1988 return wxStricmp(m_name, "UTF-8") == 0 ||
1989 wxStricmp(m_name, "UTF8") == 0;
1990 }
1991 #endif
1992
1993 #endif // HAVE_ICONV
1994
1995
1996 // ============================================================================
1997 // Win32 conversion classes
1998 // ============================================================================
1999
2000 #ifdef wxHAVE_WIN32_MB2WC
2001
2002 // from utils.cpp
2003 #if wxUSE_FONTMAP
2004 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2005 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2006 #endif
2007
2008 class wxMBConv_win32 : public wxMBConv
2009 {
2010 public:
2011 wxMBConv_win32()
2012 {
2013 m_CodePage = CP_ACP;
2014 m_minMBCharWidth = 0;
2015 }
2016
2017 wxMBConv_win32(const wxMBConv_win32& conv)
2018 : wxMBConv()
2019 {
2020 m_CodePage = conv.m_CodePage;
2021 m_minMBCharWidth = conv.m_minMBCharWidth;
2022 }
2023
2024 #if wxUSE_FONTMAP
2025 wxMBConv_win32(const char* name)
2026 {
2027 m_CodePage = wxCharsetToCodepage(name);
2028 m_minMBCharWidth = 0;
2029 }
2030
2031 wxMBConv_win32(wxFontEncoding encoding)
2032 {
2033 m_CodePage = wxEncodingToCodepage(encoding);
2034 m_minMBCharWidth = 0;
2035 }
2036 #endif // wxUSE_FONTMAP
2037
2038 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2039 {
2040 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2041 // the behaviour is not compatible with the Unix version (using iconv)
2042 // and break the library itself, e.g. wxTextInputStream::NextChar()
2043 // wouldn't work if reading an incomplete MB char didn't result in an
2044 // error
2045 //
2046 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2047 // Win XP or newer and it is not supported for UTF-[78] so we always
2048 // use our own conversions in this case. See
2049 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2050 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2051 if ( m_CodePage == CP_UTF8 )
2052 {
2053 return wxMBConvUTF8().MB2WC(buf, psz, n);
2054 }
2055
2056 if ( m_CodePage == CP_UTF7 )
2057 {
2058 return wxMBConvUTF7().MB2WC(buf, psz, n);
2059 }
2060
2061 int flags = 0;
2062 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2063 IsAtLeastWin2kSP4() )
2064 {
2065 flags = MB_ERR_INVALID_CHARS;
2066 }
2067
2068 const size_t len = ::MultiByteToWideChar
2069 (
2070 m_CodePage, // code page
2071 flags, // flags: fall on error
2072 psz, // input string
2073 -1, // its length (NUL-terminated)
2074 buf, // output string
2075 buf ? n : 0 // size of output buffer
2076 );
2077 if ( !len )
2078 {
2079 // function totally failed
2080 return wxCONV_FAILED;
2081 }
2082
2083 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2084 // check if we succeeded, by doing a double trip:
2085 if ( !flags && buf )
2086 {
2087 const size_t mbLen = strlen(psz);
2088 wxCharBuffer mbBuf(mbLen);
2089 if ( ::WideCharToMultiByte
2090 (
2091 m_CodePage,
2092 0,
2093 buf,
2094 -1,
2095 mbBuf.data(),
2096 mbLen + 1, // size in bytes, not length
2097 NULL,
2098 NULL
2099 ) == 0 ||
2100 strcmp(mbBuf, psz) != 0 )
2101 {
2102 // we didn't obtain the same thing we started from, hence
2103 // the conversion was lossy and we consider that it failed
2104 return wxCONV_FAILED;
2105 }
2106 }
2107
2108 // note that it returns count of written chars for buf != NULL and size
2109 // of the needed buffer for buf == NULL so in either case the length of
2110 // the string (which never includes the terminating NUL) is one less
2111 return len - 1;
2112 }
2113
2114 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2115 {
2116 /*
2117 we have a problem here: by default, WideCharToMultiByte() may
2118 replace characters unrepresentable in the target code page with bad
2119 quality approximations such as turning "1/2" symbol (U+00BD) into
2120 "1" for the code pages which don't have it and we, obviously, want
2121 to avoid this at any price
2122
2123 the trouble is that this function does it _silently_, i.e. it won't
2124 even tell us whether it did or not... Win98/2000 and higher provide
2125 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2126 we have to resort to a round trip, i.e. check that converting back
2127 results in the same string -- this is, of course, expensive but
2128 otherwise we simply can't be sure to not garble the data.
2129 */
2130
2131 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2132 // it doesn't work with CJK encodings (which we test for rather roughly
2133 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2134 // supporting it
2135 BOOL usedDef wxDUMMY_INITIALIZE(false);
2136 BOOL *pUsedDef;
2137 int flags;
2138 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2139 {
2140 // it's our lucky day
2141 flags = WC_NO_BEST_FIT_CHARS;
2142 pUsedDef = &usedDef;
2143 }
2144 else // old system or unsupported encoding
2145 {
2146 flags = 0;
2147 pUsedDef = NULL;
2148 }
2149
2150 const size_t len = ::WideCharToMultiByte
2151 (
2152 m_CodePage, // code page
2153 flags, // either none or no best fit
2154 pwz, // input string
2155 -1, // it is (wide) NUL-terminated
2156 buf, // output buffer
2157 buf ? n : 0, // and its size
2158 NULL, // default "replacement" char
2159 pUsedDef // [out] was it used?
2160 );
2161
2162 if ( !len )
2163 {
2164 // function totally failed
2165 return wxCONV_FAILED;
2166 }
2167
2168 // if we were really converting, check if we succeeded
2169 if ( buf )
2170 {
2171 if ( flags )
2172 {
2173 // check if the conversion failed, i.e. if any replacements
2174 // were done
2175 if ( usedDef )
2176 return wxCONV_FAILED;
2177 }
2178 else // we must resort to double tripping...
2179 {
2180 wxWCharBuffer wcBuf(n);
2181 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2182 wcscmp(wcBuf, pwz) != 0 )
2183 {
2184 // we didn't obtain the same thing we started from, hence
2185 // the conversion was lossy and we consider that it failed
2186 return wxCONV_FAILED;
2187 }
2188 }
2189 }
2190
2191 // see the comment above for the reason of "len - 1"
2192 return len - 1;
2193 }
2194
2195 virtual size_t GetMBNulLen() const
2196 {
2197 if ( m_minMBCharWidth == 0 )
2198 {
2199 int len = ::WideCharToMultiByte
2200 (
2201 m_CodePage, // code page
2202 0, // no flags
2203 L"", // input string
2204 1, // translate just the NUL
2205 NULL, // output buffer
2206 0, // and its size
2207 NULL, // no replacement char
2208 NULL // [out] don't care if it was used
2209 );
2210
2211 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2212 switch ( len )
2213 {
2214 default:
2215 wxLogDebug(_T("Unexpected NUL length %d"), len);
2216 self->m_minMBCharWidth = (size_t)-1;
2217 break;
2218
2219 case 0:
2220 self->m_minMBCharWidth = (size_t)-1;
2221 break;
2222
2223 case 1:
2224 case 2:
2225 case 4:
2226 self->m_minMBCharWidth = len;
2227 break;
2228 }
2229 }
2230
2231 return m_minMBCharWidth;
2232 }
2233
2234 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2235
2236 bool IsOk() const { return m_CodePage != -1; }
2237
2238 private:
2239 static bool CanUseNoBestFit()
2240 {
2241 static int s_isWin98Or2k = -1;
2242
2243 if ( s_isWin98Or2k == -1 )
2244 {
2245 int verMaj, verMin;
2246 switch ( wxGetOsVersion(&verMaj, &verMin) )
2247 {
2248 case wxOS_WINDOWS_9X:
2249 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2250 break;
2251
2252 case wxOS_WINDOWS_NT:
2253 s_isWin98Or2k = verMaj >= 5;
2254 break;
2255
2256 default:
2257 // unknown: be conservative by default
2258 s_isWin98Or2k = 0;
2259 break;
2260 }
2261
2262 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2263 }
2264
2265 return s_isWin98Or2k == 1;
2266 }
2267
2268 static bool IsAtLeastWin2kSP4()
2269 {
2270 #ifdef __WXWINCE__
2271 return false;
2272 #else
2273 static int s_isAtLeastWin2kSP4 = -1;
2274
2275 if ( s_isAtLeastWin2kSP4 == -1 )
2276 {
2277 OSVERSIONINFOEX ver;
2278
2279 memset(&ver, 0, sizeof(ver));
2280 ver.dwOSVersionInfoSize = sizeof(ver);
2281 GetVersionEx((OSVERSIONINFO*)&ver);
2282
2283 s_isAtLeastWin2kSP4 =
2284 ((ver.dwMajorVersion > 5) || // Vista+
2285 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2286 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2287 ver.wServicePackMajor >= 4)) // 2000 SP4+
2288 ? 1 : 0;
2289 }
2290
2291 return s_isAtLeastWin2kSP4 == 1;
2292 #endif
2293 }
2294
2295
2296 // the code page we're working with
2297 long m_CodePage;
2298
2299 // cached result of GetMBNulLen(), set to 0 initially meaning
2300 // "unknown"
2301 size_t m_minMBCharWidth;
2302 };
2303
2304 #endif // wxHAVE_WIN32_MB2WC
2305
2306 // ============================================================================
2307 // CoreFoundation conversion classes
2308 // ============================================================================
2309
2310 #ifdef __DARWIN__
2311
2312 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2313 {
2314 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2315
2316 switch (encoding)
2317 {
2318 case wxFONTENCODING_DEFAULT :
2319 enc = CFStringGetSystemEncoding();
2320 break ;
2321
2322 case wxFONTENCODING_ISO8859_1 :
2323 enc = kCFStringEncodingISOLatin1 ;
2324 break ;
2325 case wxFONTENCODING_ISO8859_2 :
2326 enc = kCFStringEncodingISOLatin2;
2327 break ;
2328 case wxFONTENCODING_ISO8859_3 :
2329 enc = kCFStringEncodingISOLatin3 ;
2330 break ;
2331 case wxFONTENCODING_ISO8859_4 :
2332 enc = kCFStringEncodingISOLatin4;
2333 break ;
2334 case wxFONTENCODING_ISO8859_5 :
2335 enc = kCFStringEncodingISOLatinCyrillic;
2336 break ;
2337 case wxFONTENCODING_ISO8859_6 :
2338 enc = kCFStringEncodingISOLatinArabic;
2339 break ;
2340 case wxFONTENCODING_ISO8859_7 :
2341 enc = kCFStringEncodingISOLatinGreek;
2342 break ;
2343 case wxFONTENCODING_ISO8859_8 :
2344 enc = kCFStringEncodingISOLatinHebrew;
2345 break ;
2346 case wxFONTENCODING_ISO8859_9 :
2347 enc = kCFStringEncodingISOLatin5;
2348 break ;
2349 case wxFONTENCODING_ISO8859_10 :
2350 enc = kCFStringEncodingISOLatin6;
2351 break ;
2352 case wxFONTENCODING_ISO8859_11 :
2353 enc = kCFStringEncodingISOLatinThai;
2354 break ;
2355 case wxFONTENCODING_ISO8859_13 :
2356 enc = kCFStringEncodingISOLatin7;
2357 break ;
2358 case wxFONTENCODING_ISO8859_14 :
2359 enc = kCFStringEncodingISOLatin8;
2360 break ;
2361 case wxFONTENCODING_ISO8859_15 :
2362 enc = kCFStringEncodingISOLatin9;
2363 break ;
2364
2365 case wxFONTENCODING_KOI8 :
2366 enc = kCFStringEncodingKOI8_R;
2367 break ;
2368 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2369 enc = kCFStringEncodingDOSRussian;
2370 break ;
2371
2372 // case wxFONTENCODING_BULGARIAN :
2373 // enc = ;
2374 // break ;
2375
2376 case wxFONTENCODING_CP437 :
2377 enc = kCFStringEncodingDOSLatinUS ;
2378 break ;
2379 case wxFONTENCODING_CP850 :
2380 enc = kCFStringEncodingDOSLatin1;
2381 break ;
2382 case wxFONTENCODING_CP852 :
2383 enc = kCFStringEncodingDOSLatin2;
2384 break ;
2385 case wxFONTENCODING_CP855 :
2386 enc = kCFStringEncodingDOSCyrillic;
2387 break ;
2388 case wxFONTENCODING_CP866 :
2389 enc = kCFStringEncodingDOSRussian ;
2390 break ;
2391 case wxFONTENCODING_CP874 :
2392 enc = kCFStringEncodingDOSThai;
2393 break ;
2394 case wxFONTENCODING_CP932 :
2395 enc = kCFStringEncodingDOSJapanese;
2396 break ;
2397 case wxFONTENCODING_CP936 :
2398 enc = kCFStringEncodingDOSChineseSimplif ;
2399 break ;
2400 case wxFONTENCODING_CP949 :
2401 enc = kCFStringEncodingDOSKorean;
2402 break ;
2403 case wxFONTENCODING_CP950 :
2404 enc = kCFStringEncodingDOSChineseTrad;
2405 break ;
2406 case wxFONTENCODING_CP1250 :
2407 enc = kCFStringEncodingWindowsLatin2;
2408 break ;
2409 case wxFONTENCODING_CP1251 :
2410 enc = kCFStringEncodingWindowsCyrillic ;
2411 break ;
2412 case wxFONTENCODING_CP1252 :
2413 enc = kCFStringEncodingWindowsLatin1 ;
2414 break ;
2415 case wxFONTENCODING_CP1253 :
2416 enc = kCFStringEncodingWindowsGreek;
2417 break ;
2418 case wxFONTENCODING_CP1254 :
2419 enc = kCFStringEncodingWindowsLatin5;
2420 break ;
2421 case wxFONTENCODING_CP1255 :
2422 enc = kCFStringEncodingWindowsHebrew ;
2423 break ;
2424 case wxFONTENCODING_CP1256 :
2425 enc = kCFStringEncodingWindowsArabic ;
2426 break ;
2427 case wxFONTENCODING_CP1257 :
2428 enc = kCFStringEncodingWindowsBalticRim;
2429 break ;
2430 // This only really encodes to UTF7 (if that) evidently
2431 // case wxFONTENCODING_UTF7 :
2432 // enc = kCFStringEncodingNonLossyASCII ;
2433 // break ;
2434 case wxFONTENCODING_UTF8 :
2435 enc = kCFStringEncodingUTF8 ;
2436 break ;
2437 case wxFONTENCODING_EUC_JP :
2438 enc = kCFStringEncodingEUC_JP;
2439 break ;
2440 /* Don't support conversion to/from UTF16 as wxWidgets can do this better.
2441 * In particular, ToWChar would fail miserably using strlen on an input UTF16.
2442 case wxFONTENCODING_UTF16 :
2443 enc = kCFStringEncodingUnicode ;
2444 break ;
2445 */
2446 case wxFONTENCODING_MACROMAN :
2447 enc = kCFStringEncodingMacRoman ;
2448 break ;
2449 case wxFONTENCODING_MACJAPANESE :
2450 enc = kCFStringEncodingMacJapanese ;
2451 break ;
2452 case wxFONTENCODING_MACCHINESETRAD :
2453 enc = kCFStringEncodingMacChineseTrad ;
2454 break ;
2455 case wxFONTENCODING_MACKOREAN :
2456 enc = kCFStringEncodingMacKorean ;
2457 break ;
2458 case wxFONTENCODING_MACARABIC :
2459 enc = kCFStringEncodingMacArabic ;
2460 break ;
2461 case wxFONTENCODING_MACHEBREW :
2462 enc = kCFStringEncodingMacHebrew ;
2463 break ;
2464 case wxFONTENCODING_MACGREEK :
2465 enc = kCFStringEncodingMacGreek ;
2466 break ;
2467 case wxFONTENCODING_MACCYRILLIC :
2468 enc = kCFStringEncodingMacCyrillic ;
2469 break ;
2470 case wxFONTENCODING_MACDEVANAGARI :
2471 enc = kCFStringEncodingMacDevanagari ;
2472 break ;
2473 case wxFONTENCODING_MACGURMUKHI :
2474 enc = kCFStringEncodingMacGurmukhi ;
2475 break ;
2476 case wxFONTENCODING_MACGUJARATI :
2477 enc = kCFStringEncodingMacGujarati ;
2478 break ;
2479 case wxFONTENCODING_MACORIYA :
2480 enc = kCFStringEncodingMacOriya ;
2481 break ;
2482 case wxFONTENCODING_MACBENGALI :
2483 enc = kCFStringEncodingMacBengali ;
2484 break ;
2485 case wxFONTENCODING_MACTAMIL :
2486 enc = kCFStringEncodingMacTamil ;
2487 break ;
2488 case wxFONTENCODING_MACTELUGU :
2489 enc = kCFStringEncodingMacTelugu ;
2490 break ;
2491 case wxFONTENCODING_MACKANNADA :
2492 enc = kCFStringEncodingMacKannada ;
2493 break ;
2494 case wxFONTENCODING_MACMALAJALAM :
2495 enc = kCFStringEncodingMacMalayalam ;
2496 break ;
2497 case wxFONTENCODING_MACSINHALESE :
2498 enc = kCFStringEncodingMacSinhalese ;
2499 break ;
2500 case wxFONTENCODING_MACBURMESE :
2501 enc = kCFStringEncodingMacBurmese ;
2502 break ;
2503 case wxFONTENCODING_MACKHMER :
2504 enc = kCFStringEncodingMacKhmer ;
2505 break ;
2506 case wxFONTENCODING_MACTHAI :
2507 enc = kCFStringEncodingMacThai ;
2508 break ;
2509 case wxFONTENCODING_MACLAOTIAN :
2510 enc = kCFStringEncodingMacLaotian ;
2511 break ;
2512 case wxFONTENCODING_MACGEORGIAN :
2513 enc = kCFStringEncodingMacGeorgian ;
2514 break ;
2515 case wxFONTENCODING_MACARMENIAN :
2516 enc = kCFStringEncodingMacArmenian ;
2517 break ;
2518 case wxFONTENCODING_MACCHINESESIMP :
2519 enc = kCFStringEncodingMacChineseSimp ;
2520 break ;
2521 case wxFONTENCODING_MACTIBETAN :
2522 enc = kCFStringEncodingMacTibetan ;
2523 break ;
2524 case wxFONTENCODING_MACMONGOLIAN :
2525 enc = kCFStringEncodingMacMongolian ;
2526 break ;
2527 case wxFONTENCODING_MACETHIOPIC :
2528 enc = kCFStringEncodingMacEthiopic ;
2529 break ;
2530 case wxFONTENCODING_MACCENTRALEUR :
2531 enc = kCFStringEncodingMacCentralEurRoman ;
2532 break ;
2533 case wxFONTENCODING_MACVIATNAMESE :
2534 enc = kCFStringEncodingMacVietnamese ;
2535 break ;
2536 case wxFONTENCODING_MACARABICEXT :
2537 enc = kCFStringEncodingMacExtArabic ;
2538 break ;
2539 case wxFONTENCODING_MACSYMBOL :
2540 enc = kCFStringEncodingMacSymbol ;
2541 break ;
2542 case wxFONTENCODING_MACDINGBATS :
2543 enc = kCFStringEncodingMacDingbats ;
2544 break ;
2545 case wxFONTENCODING_MACTURKISH :
2546 enc = kCFStringEncodingMacTurkish ;
2547 break ;
2548 case wxFONTENCODING_MACCROATIAN :
2549 enc = kCFStringEncodingMacCroatian ;
2550 break ;
2551 case wxFONTENCODING_MACICELANDIC :
2552 enc = kCFStringEncodingMacIcelandic ;
2553 break ;
2554 case wxFONTENCODING_MACROMANIAN :
2555 enc = kCFStringEncodingMacRomanian ;
2556 break ;
2557 case wxFONTENCODING_MACCELTIC :
2558 enc = kCFStringEncodingMacCeltic ;
2559 break ;
2560 case wxFONTENCODING_MACGAELIC :
2561 enc = kCFStringEncodingMacGaelic ;
2562 break ;
2563 // case wxFONTENCODING_MACKEYBOARD :
2564 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2565 // break ;
2566
2567 default :
2568 // because gcc is picky
2569 break ;
2570 }
2571
2572 return enc ;
2573 }
2574
#if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4
// Provide a constant for the wchar_t encoding used by the host platform:
// UTF-32 in the host byte order (big endian when WORDS_BIGENDIAN is defined,
// little endian otherwise). Only defined when building against headers that
// are at least as new as Mac OS X 10.4.
#ifdef WORDS_BIGENDIAN
static const CFStringEncoding wxCFStringEncodingWcharT = kCFStringEncodingUTF32BE;
#else
static const CFStringEncoding wxCFStringEncodingWcharT = kCFStringEncodingUTF32LE;
#endif

#endif /* MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4 */
2584
2585 class wxMBConv_cf : public wxMBConv
2586 {
2587 public:
2588 wxMBConv_cf()
2589 {
2590 Init(CFStringGetSystemEncoding()) ;
2591 }
2592
2593 wxMBConv_cf(const wxMBConv_cf& conv)
2594 {
2595 m_encoding = conv.m_encoding;
2596 }
2597
2598 #if wxUSE_FONTMAP
2599 wxMBConv_cf(const char* name)
2600 {
2601 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2602 }
2603 #endif
2604
2605 wxMBConv_cf(wxFontEncoding encoding)
2606 {
2607 Init( wxCFStringEncFromFontEnc(encoding) );
2608 }
2609
2610 virtual ~wxMBConv_cf()
2611 {
2612 }
2613
2614 void Init( CFStringEncoding encoding)
2615 {
2616 m_encoding = encoding ;
2617 }
2618
2619 virtual size_t ToWChar(wchar_t * dst, size_t dstSize, const char * src, size_t srcSize = wxNO_LEN) const
2620 {
2621 wxCHECK(src, wxCONV_FAILED);
2622
2623 /* NOTE: This is wrong if the source encoding has an element size
2624 * other than char (e.g. it's kCFStringEncodingUnicode)
2625 * If the user specifies it, it's presumably right though.
2626 * Right now we don't support UTF-16 in anyway since wx can do a better job.
2627 */
2628 if(srcSize == wxNO_LEN)
2629 srcSize = strlen(src) + 1;
2630
2631 // First create the temporary CFString
2632 wxCFRef<CFStringRef> theString( CFStringCreateWithBytes (
2633 NULL, //the allocator
2634 (const UInt8*)src,
2635 srcSize,
2636 m_encoding,
2637 false //no BOM/external representation
2638 ));
2639
2640 wxCHECK(theString != NULL, wxCONV_FAILED);
2641
2642 /* NOTE: The string content includes the NULL element if the source string did
2643 * That means we have to do nothing special because the destination will have
2644 * the NULL element iff the source did and the NULL element will be included
2645 * in the count iff it was included in the source count.
2646 */
2647
2648
2649 /* If we're compiling against Tiger headers we can support direct conversion
2650 * to UTF32. If we are then run against a pre-Tiger system, the encoding
2651 * won't be available so we'll defer to the string->UTF-16->UTF-32 conversion.
2652 */
2653 #if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4
2654 if(CFStringIsEncodingAvailable(wxCFStringEncodingWcharT))
2655 {
2656 CFRange fullStringRange = CFRangeMake(0, CFStringGetLength(theString));
2657 CFIndex usedBufLen;
2658
2659 CFIndex charsConverted = CFStringGetBytes(
2660 theString,
2661 fullStringRange,
2662 wxCFStringEncodingWcharT,
2663 0,
2664 false,
2665 // if dstSize is 0 then pass NULL to get required length in usedBufLen
2666 dstSize != 0?(UInt8*)dst:NULL,
2667 dstSize * sizeof(wchar_t),
2668 &usedBufLen);
2669
2670 // charsConverted is > 0 iff conversion succeeded
2671 if(charsConverted <= 0)
2672 return wxCONV_FAILED;
2673
2674 /* usedBufLen is the number of bytes written, so we divide by
2675 * sizeof(wchar_t) to get the number of elements written.
2676 */
2677 wxASSERT( (usedBufLen % sizeof(wchar_t)) == 0 );
2678
2679 // CFStringGetBytes does exactly the right thing when buffer
2680 // pointer is NULL and returns the number of bytes required
2681 return usedBufLen / sizeof(wchar_t);
2682 }
2683 else
2684 #endif /* MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4 */
2685 {
2686 // NOTE: Includes NULL iff source did
2687 /* NOTE: This is an approximation. The eventual UTF-32 will
2688 * possibly have less elements but certainly not more.
2689 */
2690 size_t returnSize = CFStringGetLength(theString);
2691
2692 if (dstSize == 0 || dst == NULL)
2693 {
2694 return returnSize;
2695 }
2696
2697 // Convert the entire string.. too hard to figure out how many UTF-16 we'd need
2698 // for an undersized UTF-32 destination buffer.
2699 CFRange fullStringRange = CFRangeMake(0, CFStringGetLength(theString));
2700 UniChar *szUniCharBuffer = new UniChar[fullStringRange.length];
2701
2702 CFStringGetCharacters(theString, fullStringRange, szUniCharBuffer);
2703
2704 wxMBConvUTF16 converter;
2705 returnSize = converter.ToWChar( dst, dstSize, (const char*)szUniCharBuffer, fullStringRange.length );
2706 delete [] szUniCharBuffer;
2707
2708 return returnSize;
2709 }
2710 // NOTREACHED
2711 }
2712
2713 virtual size_t FromWChar(char *dst, size_t dstSize, const wchar_t *src, size_t srcSize) const
2714 {
2715 wxCHECK(src, wxCONV_FAILED);
2716
2717 if(srcSize == wxNO_LEN)
2718 srcSize = wxStrlen(src) + 1;
2719
2720 // Temporary CFString
2721 wxCFRef<CFStringRef> theString;
2722
2723 /* If we're compiling against Tiger headers we can support direct conversion
2724 * from UTF32. If we are then run against a pre-Tiger system, the encoding
2725 * won't be available so we'll defer to the UTF-32->UTF-16->string conversion.
2726 */
2727 #if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4
2728 if(CFStringIsEncodingAvailable(wxCFStringEncodingWcharT))
2729 {
2730 theString = wxCFRef<CFStringRef>(CFStringCreateWithBytes(
2731 kCFAllocatorDefault,
2732 (UInt8*)src,
2733 srcSize * sizeof(wchar_t),
2734 wxCFStringEncodingWcharT,
2735 false));
2736 }
2737 else
2738 #endif /* MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_4 */
2739 {
2740 wxMBConvUTF16 converter;
2741 size_t cbUniBuffer = converter.FromWChar( NULL, 0, src, srcSize );
2742 wxASSERT(cbUniBuffer % sizeof(UniChar));
2743
2744 // Will be free'd by kCFAllocatorMalloc when CFString is released
2745 UniChar *tmpUniBuffer = (UniChar*)malloc(cbUniBuffer);
2746
2747 cbUniBuffer = converter.FromWChar( (char*) tmpUniBuffer, cbUniBuffer, src, srcSize );
2748 wxASSERT(cbUniBuffer % sizeof(UniChar));
2749
2750 theString = wxCFRef<CFStringRef>(CFStringCreateWithCharactersNoCopy(
2751 kCFAllocatorDefault,
2752 tmpUniBuffer,
2753 cbUniBuffer / sizeof(UniChar),
2754 kCFAllocatorMalloc
2755 ));
2756
2757 }
2758
2759 wxCHECK(theString != NULL, wxCONV_FAILED);
2760
2761 CFIndex usedBufLen;
2762
2763 CFIndex charsConverted = CFStringGetBytes(
2764 theString,
2765 CFRangeMake(0, CFStringGetLength(theString)),
2766 m_encoding,
2767 0, // FAIL on unconvertible characters
2768 false, // not an external representation
2769 // if dstSize is 0 then pass NULL to get required length in usedBufLen
2770 (dstSize != 0)?(UInt8*)dst:NULL,
2771 dstSize,
2772 &usedBufLen
2773 );
2774
2775 // charsConverted is > 0 iff conversion succeeded
2776 if(charsConverted <= 0)
2777 return wxCONV_FAILED;
2778
2779 return usedBufLen;
2780 }
2781
2782 virtual wxMBConv *Clone() const { return new wxMBConv_cf(*this); }
2783
2784 bool IsOk() const
2785 {
2786 return m_encoding != kCFStringEncodingInvalidId &&
2787 CFStringIsEncodingAvailable(m_encoding);
2788 }
2789
2790 private:
2791 CFStringEncoding m_encoding ;
2792 };
2793
2794 #endif // __DARWIN__
2795
2796 // ============================================================================
2797 // Mac conversion classes
2798 // ============================================================================
2799
2800 /* Although we are in the base library we currently have this wxMac
2801 * conditional. This is not generally good but fortunately does not affect
2802 * the ABI of the base library, only what encodings might work.
2803 * It does mean that a wxBase built as part of wxMac has slightly more support
2804 * than one built for wxCocoa or even wxGtk.
2805 */
2806 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2807
// Converter based on the Text Encoding Converter (TEC) API. It keeps one
// TEC converter per direction between m_char_encoding and 16 bit Unicode;
// both are created lazily by CreateIfNeeded().
class wxMBConv_mac : public wxMBConv
{
public:
    // Use the system encoding as reported by CFStringGetSystemEncoding().
    wxMBConv_mac()
    {
        Init(CFStringGetSystemEncoding()) ;
    }

    // NOTE(review): conv.m_char_encoding already holds the result of
    // CreateTextEncoding() and is passed through Init(), i.e. through
    // CreateTextEncoding() with the default variant and format, once more --
    // confirm that this preserves non-default variants/formats.
    wxMBConv_mac(const wxMBConv_mac& conv)
    {
        Init(conv.m_char_encoding);
    }

#if wxUSE_FONTMAP
    // Use the encoding the font mapper associates with the given charset
    // name (interactive dialogs are disabled).
    wxMBConv_mac(const char* name)
    {
        Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
    }
#endif

    // Use the Mac text encoding corresponding to the given wx font encoding.
    wxMBConv_mac(wxFontEncoding encoding)
    {
        Init( wxMacGetSystemEncFromFontEnc(encoding) );
    }

    // Dispose of whichever converters were created; the returned status
    // values are stored but not examined.
    virtual ~wxMBConv_mac()
    {
        OSStatus status = noErr ;
        if (m_MB2WC_converter)
            status = TECDisposeConverter(m_MB2WC_converter);
        if (m_WC2MB_converter)
            status = TECDisposeConverter(m_WC2MB_converter);
    }

    // Remember the multibyte and the (16 bit) Unicode encodings to convert
    // between and reset both converters; no converter is created here.
    void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
            TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
    {
        m_MB2WC_converter = NULL ;
        m_WC2MB_converter = NULL ;
        m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
        m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
    }

    // Create both TEC converters unless at least one of them exists already.
    // Failures are only reported via assertions, use IsOk() to check the
    // result.
    virtual void CreateIfNeeded() const
    {
        if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
        {
            OSStatus status = noErr ;
            status = TECCreateConverter(&m_MB2WC_converter,
                                    m_char_encoding,
                                    m_unicode_encoding);
            wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
            status = TECCreateConverter(&m_WC2MB_converter,
                                    m_unicode_encoding,
                                    m_char_encoding);
            wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
        }
    }

    // Convert the NUL-terminated multibyte string psz to wchar_t.
    //
    // If buf is NULL, n is ignored and the conversion is done into a
    // temporary buffer which is freed before returning, so only the result
    // length is obtained. Otherwise up to n elements are written to buf and
    // the result is NUL-terminated if there is room for it.
    //
    // Returns the value computed in "res" below.
    size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        CreateIfNeeded() ;
        OSStatus status = noErr ;
        ByteCount byteOutLen ;
        // the trailing NUL is converted as well
        ByteCount byteInLen = strlen(psz) + 1;
        wchar_t *tbuf = NULL ;
        UniChar* ubuf = NULL ;
        size_t res = 0 ;

        if (buf == NULL)
        {
            // Apple specs say at least 32
            n = wxMax( 32, byteInLen ) ;
            tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
        }

        ByteCount byteBufferLen = n * sizeof( UniChar ) ;

        // TEC produces UniChars: with 4 byte wchar_t they go to an
        // intermediate buffer, otherwise directly to the output one
#if SIZEOF_WCHAR_T == 4
        ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
#else
        ubuf = (UniChar*) (buf ? buf : tbuf) ;
#endif

        // NOTE(review): status is stored but never checked, so byteOutLen is
        // used below even if the conversion failed -- confirm it is always set.
        status = TECConvertText(
            m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
            (TextPtr) ubuf, byteBufferLen, &byteOutLen);

#if SIZEOF_WCHAR_T == 4
        // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
        // is not properly terminated we get random characters at the end
        ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
        wxMBConvUTF16 converter ;
        res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
        free( ubuf ) ;
#else
        res = byteOutLen / sizeof( UniChar ) ;
#endif

        if ( buf == NULL )
            free(tbuf) ;

        if ( buf && res < n)
            buf[res] = 0;

        return res ;
    }

    // Convert the NUL-terminated wide string psz to the multibyte encoding.
    //
    // If buf is NULL, n is ignored and the conversion is done into a
    // temporary buffer which is freed before returning. Otherwise at most n
    // bytes are written to buf and, if the result could be NUL-terminated,
    // it is converted back to verify that nothing was lost.
    //
    // Returns the number of bytes reported by TECConvertText() or
    // wxCONV_FAILED if the round trip check fails.
    size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
    {
        CreateIfNeeded() ;
        OSStatus status = noErr ;
        ByteCount byteOutLen ;
        ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;

        char *tbuf = NULL ;

        if (buf == NULL)
        {
            // Apple specs say at least 32
            n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
            tbuf = (char*) malloc( n ) ;
        }

        ByteCount byteBufferLen = n ;
        UniChar* ubuf = NULL ;

        // TEC consumes UniChars: 4 byte wchar_t input is first converted to
        // UTF-16 in a temporary buffer, otherwise psz is used directly
#if SIZEOF_WCHAR_T == 4
        wxMBConvUTF16 converter ;
        size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
        byteInLen = unicharlen ;
        ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
        converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
#else
        ubuf = (UniChar*) psz ;
#endif

        // NOTE(review): as in MB2WC(), status is never checked
        status = TECConvertText(
            m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
            (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);

#if SIZEOF_WCHAR_T == 4
        free( ubuf ) ;
#endif

        if ( buf == NULL )
            free(tbuf) ;

        size_t res = byteOutLen ;
        if ( buf && res < n)
        {
            buf[res] = 0;

            //we need to double-trip to verify it didn't insert any ? in place
            //of bogus characters
            wxWCharBuffer wcBuf(n);
            size_t pszlen = wxWcslen(psz);
            if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
                        wxWcslen(wcBuf) != pszlen ||
                        memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        return res ;
    }

    // NB: the copy is always a plain wxMBConv_mac, even when called on an
    //     object of a derived class which doesn't override Clone()
    virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }

    // The converter is usable only if both TEC converters could be created
    // (this creates them if it hadn't been done yet).
    bool IsOk() const
    {
        CreateIfNeeded() ;
        return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
    }

protected :
    // the converters are created on demand from const methods, hence mutable
    mutable TECObjectRef m_MB2WC_converter;
    mutable TECObjectRef m_WC2MB_converter;

    // the encodings set by Init(); both hold CreateTextEncoding() results
    TextEncodingBase m_char_encoding;
    TextEncodingBase m_unicode_encoding;
};
2993
2994 // MB is decomposed (D) normalized UTF8
2995
// Variant of wxMBConv_mac whose multibyte side is UTF-8: wide strings are
// passed through the canonical decomposition mapping (m_uni) before being
// encoded and through the canonical composition mapping (m_uniBack) after
// being decoded.
class wxMBConv_macUTF8D : public wxMBConv_mac
{
public :
    // The base class default ctor has already called Init() with the system
    // encoding, this replaces it with UTF-8.
    wxMBConv_macUTF8D()
    {
        Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
        m_uni = NULL;
        m_uniBack = NULL ;
    }

    // Release the Unicode mapping objects if they were created.
    virtual ~wxMBConv_macUTF8D()
    {
        if (m_uni!=NULL)
            DisposeUnicodeToTextInfo(&m_uni);
        if (m_uniBack!=NULL)
            DisposeUnicodeToTextInfo(&m_uniBack);
    }

    // Convert the NUL-terminated wide string psz to UTF-8, passing it through
    // the m_uni mapping first.
    //
    // If buf is NULL, n is ignored and the conversion is done into a
    // temporary buffer which is freed before returning. Otherwise at most n
    // bytes are written to buf and the result is NUL-terminated if there is
    // room for it.
    //
    // Returns the number of bytes reported by TECConvertText(); unlike in
    // the base class no round trip check is done.
    size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
    {
        CreateIfNeeded() ;
        OSStatus status = noErr ;
        ByteCount byteOutLen ;
        ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;

        char *tbuf = NULL ;

        if (buf == NULL)
        {
            // Apple specs say at least 32
            n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
            tbuf = (char*) malloc( n ) ;
        }

        ByteCount byteBufferLen = n ;
        UniChar* ubuf = NULL ;

        // 4 byte wchar_t input is first converted to UTF-16 in a temporary
        // buffer, otherwise psz is used directly
#if SIZEOF_WCHAR_T == 4
        wxMBConvUTF16 converter ;
        size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
        byteInLen = unicharlen ;
        ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
        converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
#else
        ubuf = (UniChar*) psz ;
#endif

        // ubuf is a non-decomposed UniChar buffer

        // the decomposed text is given twice the size of the input plus 2
        ByteCount dcubuflen = byteInLen * 2 + 2 ;
        ByteCount dcubufread , dcubufwritten ;
        UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;

        // NOTE(review): the return value is ignored, so dcubufwritten is
        // used below even if this call failed -- confirm it is always set.
        ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
            kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , dcubuf ) ;

        // we now convert that decomposed buffer into UTF8

        status = TECConvertText(
            m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
            (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);

        free( dcubuf );

#if SIZEOF_WCHAR_T == 4
        free( ubuf ) ;
#endif

        if ( buf == NULL )
            free(tbuf) ;

        size_t res = byteOutLen ;
        if ( buf && res < n)
        {
            buf[res] = 0;
            // don't test for round-trip fidelity yet, we cannot guarantee it yet
        }

        return res ;
    }

    // Convert the NUL-terminated UTF-8 string psz to wchar_t, passing the
    // decoded text through the m_uniBack mapping.
    //
    // If buf is NULL, n is ignored and the conversion is done into a
    // temporary buffer which is freed before returning. Otherwise up to n
    // elements are written to buf and the result is NUL-terminated if there
    // is room for it.
    //
    // Returns the value computed in "res" below.
    size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        CreateIfNeeded() ;
        OSStatus status = noErr ;
        ByteCount byteOutLen ;
        // the trailing NUL is converted as well
        ByteCount byteInLen = strlen(psz) + 1;
        wchar_t *tbuf = NULL ;
        UniChar* ubuf = NULL ;
        size_t res = 0 ;

        if (buf == NULL)
        {
            // Apple specs say at least 32
            n = wxMax( 32, byteInLen ) ;
            tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
        }

        ByteCount byteBufferLen = n * sizeof( UniChar ) ;

#if SIZEOF_WCHAR_T == 4
        ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
#else
        ubuf = (UniChar*) (buf ? buf : tbuf) ;
#endif

        ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
        ByteCount dcubufread , dcubufwritten ;
        UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;

        status = TECConvertText(
            m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
            (TextPtr) dcubuf, dcubuflen, &byteOutLen);
        // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
        // is not properly terminated we get random characters at the end
        dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;

        // now from the decomposed UniChar to properly composed uniChar
        // NOTE(review): the output buffer size passed here is dcubuflen but
        // ubuf was only given byteBufferLen (+ 2) bytes above -- confirm
        // that the output can never exceed the real size of ubuf.
        ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
            kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , ubuf ) ;

        free( dcubuf );
        byteOutLen = dcubufwritten ;
        ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;


#if SIZEOF_WCHAR_T == 4
        wxMBConvUTF16 converter ;
        res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
        free( ubuf ) ;
#else
        res = byteOutLen / sizeof( UniChar ) ;
#endif

        if ( buf == NULL )
            free(tbuf) ;

        if ( buf && res < n)
            buf[res] = 0;

        return res ;
    }

    // In addition to the TEC converters of the base class, create the two
    // Unicode mappings: m_uni targets the canonical decomposition variant
    // and m_uniBack the canonical composition one. Failures are only
    // reported via assertions.
    virtual void CreateIfNeeded() const
    {
        wxMBConv_mac::CreateIfNeeded() ;
        if ( m_uni == NULL )
        {
            m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
                kUnicodeNoSubset, kTextEncodingDefaultFormat);
            m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
                kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
            m_map.mappingVersion = kUnicodeUseLatestMapping;

            OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
            wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;

            // m_map is reused for the second mapping, only otherEncoding
            // differs from the values used above
            m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
                kUnicodeNoSubset, kTextEncodingDefaultFormat);
            m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
                kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
            m_map.mappingVersion = kUnicodeUseLatestMapping;
            err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
            wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
        }
    }
protected :
    // created on demand from const methods, hence mutable
    mutable UnicodeToTextInfo m_uni;      // to decomposed form, used by WC2MB()
    mutable UnicodeToTextInfo m_uniBack;  // to composed form, used by MB2WC()
    mutable UnicodeMapping m_map;         // scratch mapping used to create both
};
3167 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3168
3169 // ============================================================================
3170 // wxEncodingConverter based conversion classes
3171 // ============================================================================
3172
3173 #if wxUSE_FONTMAP
3174
3175 class wxMBConv_wxwin : public wxMBConv
3176 {
3177 private:
3178 void Init()
3179 {
3180 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
3181 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
3182 }
3183
3184 public:
3185 // temporarily just use wxEncodingConverter stuff,
3186 // so that it works while a better implementation is built
3187 wxMBConv_wxwin(const char* name)
3188 {
3189 if (name)
3190 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3191 else
3192 m_enc = wxFONTENCODING_SYSTEM;
3193
3194 Init();
3195 }
3196
3197 wxMBConv_wxwin(wxFontEncoding enc)
3198 {
3199 m_enc = enc;
3200
3201 Init();
3202 }
3203
3204 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
3205 {
3206 size_t inbuf = strlen(psz);
3207 if (buf)
3208 {
3209 if (!m2w.Convert(psz, buf))
3210 return wxCONV_FAILED;
3211 }
3212 return inbuf;
3213 }
3214
3215 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
3216 {
3217 const size_t inbuf = wxWcslen(psz);
3218 if (buf)
3219 {
3220 if (!w2m.Convert(psz, buf))
3221 return wxCONV_FAILED;
3222 }
3223
3224 return inbuf;
3225 }
3226
3227 virtual size_t GetMBNulLen() const
3228 {
3229 switch ( m_enc )
3230 {
3231 case wxFONTENCODING_UTF16BE:
3232 case wxFONTENCODING_UTF16LE:
3233 return 2;
3234
3235 case wxFONTENCODING_UTF32BE:
3236 case wxFONTENCODING_UTF32LE:
3237 return 4;
3238
3239 default:
3240 return 1;
3241 }
3242 }
3243
3244 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
3245
3246 bool IsOk() const { return m_ok; }
3247
3248 public:
3249 wxFontEncoding m_enc;
3250 wxEncodingConverter m2w, w2m;
3251
3252 private:
3253 // were we initialized successfully?
3254 bool m_ok;
3255
3256 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
3257 };
3258
3259 // make the constructors available for unit testing
3260 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
3261 {
3262 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
3263 if ( !result->IsOk() )
3264 {
3265 delete result;
3266 return 0;
3267 }
3268
3269 return result;
3270 }
3271
3272 #endif // wxUSE_FONTMAP
3273
3274 // ============================================================================
3275 // wxCSConv implementation
3276 // ============================================================================
3277
3278 void wxCSConv::Init()
3279 {
3280 m_name = NULL;
3281 m_convReal = NULL;
3282 m_deferred = true;
3283 }
3284
3285 wxCSConv::wxCSConv(const wxString& charset)
3286 {
3287 Init();
3288
3289 if ( !charset.empty() )
3290 {
3291 SetName(charset.ToAscii());
3292 }
3293
3294 #if wxUSE_FONTMAP
3295 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3296 #else
3297 m_encoding = wxFONTENCODING_SYSTEM;
3298 #endif
3299 }
3300
3301 wxCSConv::wxCSConv(wxFontEncoding encoding)
3302 {
3303 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3304 {
3305 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3306
3307 encoding = wxFONTENCODING_SYSTEM;
3308 }
3309
3310 Init();
3311
3312 m_encoding = encoding;
3313 }
3314
3315 wxCSConv::~wxCSConv()
3316 {
3317 Clear();
3318 }
3319
3320 wxCSConv::wxCSConv(const wxCSConv& conv)
3321 : wxMBConv()
3322 {
3323 Init();
3324
3325 SetName(conv.m_name);
3326 m_encoding = conv.m_encoding;
3327 }
3328
3329 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3330 {
3331 Clear();
3332
3333 SetName(conv.m_name);
3334 m_encoding = conv.m_encoding;
3335
3336 return *this;
3337 }
3338
3339 void wxCSConv::Clear()
3340 {
3341 free(m_name);
3342 delete m_convReal;
3343
3344 m_name = NULL;
3345 m_convReal = NULL;
3346 }
3347
3348 void wxCSConv::SetName(const char *charset)
3349 {
3350 if (charset)
3351 {
3352 m_name = strdup(charset);
3353 m_deferred = true;
3354 }
3355 }
3356
#if wxUSE_FONTMAP

// Hash map from a font encoding to a charset name.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache of the charset names which could be used with iconv for the given
// encoding, an empty name means that none of them worked; it is consulted
// and filled by wxCSConv::DoCreate().
static wxEncodingNameCache gs_nameCache;
#endif
3364
// Create the real converter for m_name and/or m_encoding.
//
// The available conversion methods are tried in turn (see the numbered steps
// below) and the first converter reporting IsOk() is returned; the caller
// gets a new object. NULL is returned if no conversion object is needed
// (ISO8859-1 or default encoding), if a previous iconv failure for this
// encoding was cached or if everything failed, in which case an error is
// also logged.
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
    wxLogTrace(TRACE_STRCONV,
               wxT("creating conversion for %s"),
               (m_name ? m_name
                       : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
#endif // wxUSE_FONTMAP

    // check for the special case of ASCII or ISO8859-1 charset: as we have
    // special knowledge of it anyhow, we don't need to create a special
    // conversion object
    if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
            m_encoding == wxFONTENCODING_DEFAULT )
    {
        // don't convert at all
        return NULL;
    }

    // we trust OS to do conversion better than we can so try external
    // conversion methods first
    //
    // the full order is:
    //      1. OS conversion (iconv() under Unix or Win32 API)
    //      2. hard coded conversions for UTF
    //      3. wxEncodingConverter as fall back

    // step (1)
#ifdef HAVE_ICONV
    // without the font mapper only the charset name can be used with iconv
#if !wxUSE_FONTMAP
    if ( m_name )
#endif // !wxUSE_FONTMAP
    {
#if wxUSE_FONTMAP
        wxFontEncoding encoding(m_encoding);
#endif

        if ( m_name )
        {
            // first try the name exactly as it was given to us
            wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
            if ( conv->IsOk() )
                return conv;

            delete conv;

#if wxUSE_FONTMAP
            encoding =
                wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
#endif // wxUSE_FONTMAP
        }
#if wxUSE_FONTMAP
        {
            // check if we had already looked for a name working for this
            // encoding
            const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
            if ( it != gs_nameCache.end() )
            {
                // NB: a cached failure makes us return immediately, without
                //     trying any of the other conversion methods below
                if ( it->second.empty() )
                    return NULL;

                wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
                if ( conv->IsOk() )
                    return conv;

                delete conv;
            }

            const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
            // CS : in case this does not return valid names (eg for MacRoman)
            // encoding got a 'failure' entry in the cache all the same,
            // although it just has to be created using a different method, so
            // only store failed iconv creation attempts (or perhaps we
            // shouldn't do this at all ?)
            if ( names[0] != NULL )
            {
                // try all the names of this encoding and remember the first
                // one which works
                for ( ; *names; ++names )
                {
                    // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
                    //             will need changes that will obsolete this
                    wxString name(*names);
                    wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
                    if ( conv->IsOk() )
                    {
                        gs_nameCache[encoding] = *names;
                        return conv;
                    }

                    delete conv;
                }

                gs_nameCache[encoding] = _T(""); // cache the failure
            }
        }
#endif // wxUSE_FONTMAP
    }
#endif // HAVE_ICONV

#ifdef wxHAVE_WIN32_MB2WC
    {
#if wxUSE_FONTMAP
        wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
                                      : new wxMBConv_win32(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
#else
        return NULL;
#endif
    }
#endif // wxHAVE_WIN32_MB2WC

    // the TEC based converter is tried before the CoreFoundation one when
    // both are compiled in
#if defined(__WXMAC__)
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
                                        : new wxMBConv_mac(m_encoding);
#else
            wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
#endif
            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif

#ifdef __DARWIN__
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
                                       : new wxMBConv_cf(m_encoding);
#else
            wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
#endif

            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif // __DARWIN__

    // step (2)
    wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
    if ( enc == wxFONTENCODING_SYSTEM && m_name )
    {
        // use "false" to suppress interactive dialogs -- we can be called from
        // anywhere and popping up a dialog from here is the last thing we want to
        // do
        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
    }
#endif // wxUSE_FONTMAP

    switch ( enc )
    {
        case wxFONTENCODING_UTF7:
             return new wxMBConvUTF7;

        case wxFONTENCODING_UTF8:
             return new wxMBConvUTF8;

        case wxFONTENCODING_UTF16BE:
             return new wxMBConvUTF16BE;

        case wxFONTENCODING_UTF16LE:
             return new wxMBConvUTF16LE;

        case wxFONTENCODING_UTF32BE:
             return new wxMBConvUTF32BE;

        case wxFONTENCODING_UTF32LE:
             return new wxMBConvUTF32LE;

        default:
             // nothing to do but put here to suppress gcc warnings
             break;
    }

    // step (3)
#if wxUSE_FONTMAP
    {
        wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
                                      : new wxMBConv_wxwin(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
    }
#endif // wxUSE_FONTMAP

    // NB: This is a hack to prevent deadlock. What could otherwise happen
    //     in Unicode build: wxConvLocal creation ends up being here
    //     because of some failure and logs the error. But wxLog will try to
    //     attach a timestamp, for which it will need wxConvLocal (to convert
    //     time to char* and then wchar_t*), but that fails, tries to log the
    //     error, but wxLog has an (already locked) critical section that
    //     guards the static buffer.
    static bool alreadyLoggingError = false;
    if (!alreadyLoggingError)
    {
        alreadyLoggingError = true;
        wxLogError(_("Cannot convert from the charset '%s'!"),
                   m_name ? m_name
                      :
#if wxUSE_FONTMAP
                         (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
#else // !wxUSE_FONTMAP
                         (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
              );

        alreadyLoggingError = false;
    }

    return NULL;
}
3591
3592 void wxCSConv::CreateConvIfNeeded() const
3593 {
3594 if ( m_deferred )
3595 {
3596 wxCSConv *self = (wxCSConv *)this; // const_cast
3597
3598 // if we don't have neither the name nor the encoding, use the default
3599 // encoding for this system
3600 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3601 {
3602 #if wxUSE_INTL
3603 self->m_encoding = wxLocale::GetSystemEncoding();
3604 #else
3605 // fallback to some reasonable default:
3606 self->m_encoding = wxFONTENCODING_ISO8859_1;
3607 #endif // wxUSE_INTL
3608 }
3609
3610 self->m_convReal = DoCreate();
3611 self->m_deferred = false;
3612 }
3613 }
3614
3615 bool wxCSConv::IsOk() const
3616 {
3617 CreateConvIfNeeded();
3618
3619 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3620 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3621 return true; // always ok as we do it ourselves
3622
3623 // m_convReal->IsOk() is called at its own creation, so we know it must
3624 // be ok if m_convReal is non-NULL
3625 return m_convReal != NULL;
3626 }
3627
3628 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3629 const char *src, size_t srcLen) const
3630 {
3631 CreateConvIfNeeded();
3632
3633 if (m_convReal)
3634 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3635
3636 // latin-1 (direct)
3637 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3638 }
3639
3640 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3641 const wchar_t *src, size_t srcLen) const
3642 {
3643 CreateConvIfNeeded();
3644
3645 if (m_convReal)
3646 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3647
3648 // latin-1 (direct)
3649 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3650 }
3651
3652 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3653 {
3654 CreateConvIfNeeded();
3655
3656 if (m_convReal)
3657 return m_convReal->MB2WC(buf, psz, n);
3658
3659 // latin-1 (direct)
3660 size_t len = strlen(psz);
3661
3662 if (buf)
3663 {
3664 for (size_t c = 0; c <= len; c++)
3665 buf[c] = (unsigned char)(psz[c]);
3666 }
3667
3668 return len;
3669 }
3670
3671 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3672 {
3673 CreateConvIfNeeded();
3674
3675 if (m_convReal)
3676 return m_convReal->WC2MB(buf, psz, n);
3677
3678 // latin-1 (direct)
3679 const size_t len = wxWcslen(psz);
3680 if (buf)
3681 {
3682 for (size_t c = 0; c <= len; c++)
3683 {
3684 if (psz[c] > 0xFF)
3685 return wxCONV_FAILED;
3686
3687 buf[c] = (char)psz[c];
3688 }
3689 }
3690 else
3691 {
3692 for (size_t c = 0; c <= len; c++)
3693 {
3694 if (psz[c] > 0xFF)
3695 return wxCONV_FAILED;
3696 }
3697 }
3698
3699 return len;
3700 }
3701
3702 size_t wxCSConv::GetMBNulLen() const
3703 {
3704 CreateConvIfNeeded();
3705
3706 if ( m_convReal )
3707 {
3708 return m_convReal->GetMBNulLen();
3709 }
3710
3711 // otherwise, we are ISO-8859-1
3712 return 1;
3713 }
3714
#if wxUSE_UNICODE_UTF8
// Returns whatever the real converter reports, or false when there is no
// real converter.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    // without a real converter, we are ISO-8859-1 and so not UTF-8
    return m_convReal && m_convReal->IsUTF8();
}
#endif
3729
3730
3731 #if wxUSE_UNICODE
3732
3733 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3734 {
3735 if ( !s )
3736 return wxWCharBuffer();
3737
3738 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3739 if ( !wbuf )
3740 wbuf = wxMBConvUTF8().cMB2WX(s);
3741 if ( !wbuf )
3742 wbuf = wxConvISO8859_1.cMB2WX(s);
3743
3744 return wbuf;
3745 }
3746
3747 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3748 {
3749 if ( !ws )
3750 return wxCharBuffer();
3751
3752 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3753 if ( !buf )
3754 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3755
3756 return buf;
3757 }
3758
3759 #endif // wxUSE_UNICODE
3760
3761 // ----------------------------------------------------------------------------
3762 // globals
3763 // ----------------------------------------------------------------------------
3764
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3771
// get rid of any existing macro definitions of these names before they are
// used as identifiers below
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Defines a global converter called conv:
//  - a NULL-initialized pointer conv##Ptr of type IfaceT*
//  - an accessor wxGet_##conv##Ptr() returning the address of a function
//    static object of type ImplT constructed with args
//  - a file static variable initialized by calling the accessor
#define WX_DEFINE_GLOBAL_CONV2(IfaceT, ImplT, conv, args)                  \
    WXDLLIMPEXP_DATA_BASE(IfaceT*) conv##Ptr = NULL;                       \
    WXDLLIMPEXP_BASE IfaceT* wxGet_##conv##Ptr()                           \
    {                                                                      \
        static ImplT conv##Obj args;                                       \
        return &conv##Obj;                                                 \
    }                                                                      \
    /* this ensures that all global converter objects are created */      \
    /* by the time static initialization is done, i.e. before any */      \
    /* thread is launched: */                                              \
    static IfaceT* gs_##conv##instance = wxGet_##conv##Ptr()

// Same as above for the common case when the interface and the
// implementation classes are the same
#define WX_DEFINE_GLOBAL_CONV(IfaceT, conv, args) \
    WX_DEFINE_GLOBAL_CONV2(IfaceT, IfaceT, conv, args)
3792
3793 #ifdef __WINDOWS__
3794 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3795 #elif defined(__WXMAC__) && !defined(__MACH__)
3796 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_mac, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3797 #else
3798 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3799 #endif
3800
3801 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3802 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3803
3804 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3805 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3806
3807 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3808 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3809
3810 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3811 static wxMBConv_macUTF8D wxConvMacUTF8DObj;
3812 #endif
3813 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3814 #ifdef __WXOSX__
3815 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3816 &wxConvMacUTF8DObj;
3817 #else
3818 wxGet_wxConvUTF8Ptr();
3819 #endif
3820 #else // !__WXOSX__
3821 wxGet_wxConvLibcPtr();
3822 #endif // __WXOSX__/!__WXOSX__
3823
3824 #else // !wxUSE_WCHAR_T
3825
3826 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3827 // stand-ins in absence of wchar_t
3828 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3829 wxConvISO8859_1,
3830 wxConvLocal,
3831 wxConvUTF8;
3832
3833 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T