]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Rename wxMBConv_cocoa to wxMBConv_cf and use it when __DARWIN__ (all Darwin and OS...
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef __SALFORDC__
48 #include <clib.h>
49 #endif
50
51 #ifdef HAVE_ICONV
52 #include <iconv.h>
53 #include "wx/thread.h"
54 #endif
55
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
58
59 #ifdef __DARWIN__
60 #include <CoreFoundation/CFString.h>
61 #include <CoreFoundation/CFStringEncodingExt.h>
62 #endif //def __DARWIN__
63
64 #ifdef __WXMAC__
65 #ifndef __DARWIN__
66 #include <ATSUnicode.h>
67 #include <TextCommon.h>
68 #include <TextEncodingConverter.h>
69 #endif
70
71 // includes Mac headers
72 #include "wx/mac/private.h"
73 #endif
74
75
76 #define TRACE_STRCONV _T("strconv")
77
78 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
79 // be 4 bytes
80 #if SIZEOF_WCHAR_T == 2
81 #define WC_UTF16
82 #endif
83
84
85 // ============================================================================
86 // implementation
87 // ============================================================================
88
// helper used by the conversion functions: returns true if at least one of
// the n bytes starting at p is not NUL (and so false if they are all NULs or
// if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; ++i )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
97
98 // ----------------------------------------------------------------------------
99 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
100 // ----------------------------------------------------------------------------
101
102 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
103 {
104 if (input <= 0xffff)
105 {
106 if (output)
107 *output = (wxUint16) input;
108
109 return 1;
110 }
111 else if (input >= 0x110000)
112 {
113 return wxCONV_FAILED;
114 }
115 else
116 {
117 if (output)
118 {
119 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
120 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
121 }
122
123 return 2;
124 }
125 }
126
127 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
128 {
129 if ((*input < 0xd800) || (*input > 0xdfff))
130 {
131 output = *input;
132 return 1;
133 }
134 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
135 {
136 output = *input;
137 return wxCONV_FAILED;
138 }
139 else
140 {
141 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
142 return 2;
143 }
144 }
145
146 #ifdef WC_UTF16
147 typedef wchar_t wxDecodeSurrogate_t;
148 #else // !WC_UTF16
149 typedef wxUint16 wxDecodeSurrogate_t;
150 #endif // WC_UTF16/!WC_UTF16
151
152 // returns the next UTF-32 character from the wchar_t buffer and advances the
153 // pointer to the character after this one
154 //
155 // if an invalid character is found, *pSrc is set to NULL, the caller must
156 // check for this
157 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
158 {
159 wxUint32 out;
160 const size_t
161 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
162 if ( n == wxCONV_FAILED )
163 *pSrc = NULL;
164 else
165 *pSrc += n;
166
167 return out;
168 }
169
170 // ----------------------------------------------------------------------------
171 // wxMBConv
172 // ----------------------------------------------------------------------------
173
174 size_t
175 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
176 const char *src, size_t srcLen) const
177 {
178 // although new conversion classes are supposed to implement this function
179 // directly, the existins ones only implement the old MB2WC() and so, to
180 // avoid to have to rewrite all conversion classes at once, we provide a
181 // default (but not efficient) implementation of this one in terms of the
182 // old function by copying the input to ensure that it's NUL-terminated and
183 // then using MB2WC() to convert it
184
185 // the number of chars [which would be] written to dst [if it were not NULL]
186 size_t dstWritten = 0;
187
188 // the number of NULs terminating this string
189 size_t nulLen = 0; // not really needed, but just to avoid warnings
190
191 // if we were not given the input size we just have to assume that the
192 // string is properly terminated as we have no way of knowing how long it
193 // is anyhow, but if we do have the size check whether there are enough
194 // NULs at the end
195 wxCharBuffer bufTmp;
196 const char *srcEnd;
197 if ( srcLen != wxNO_LEN )
198 {
199 // we need to know how to find the end of this string
200 nulLen = GetMBNulLen();
201 if ( nulLen == wxCONV_FAILED )
202 return wxCONV_FAILED;
203
204 // if there are enough NULs we can avoid the copy
205 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
206 {
207 // make a copy in order to properly NUL-terminate the string
208 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
209 char * const p = bufTmp.data();
210 memcpy(p, src, srcLen);
211 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
212 *s = '\0';
213
214 src = bufTmp;
215 }
216
217 srcEnd = src + srcLen;
218 }
219 else // quit after the first loop iteration
220 {
221 srcEnd = NULL;
222 }
223
224 for ( ;; )
225 {
226 // try to convert the current chunk
227 size_t lenChunk = MB2WC(NULL, src, 0);
228 if ( lenChunk == wxCONV_FAILED )
229 return wxCONV_FAILED;
230
231 lenChunk++; // for the L'\0' at the end of this chunk
232
233 dstWritten += lenChunk;
234
235 if ( lenChunk == 1 )
236 {
237 // nothing left in the input string, conversion succeeded
238 break;
239 }
240
241 if ( dst )
242 {
243 if ( dstWritten > dstLen )
244 return wxCONV_FAILED;
245
246 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
247 return wxCONV_FAILED;
248
249 dst += lenChunk;
250 }
251
252 if ( !srcEnd )
253 {
254 // we convert just one chunk in this case as this is the entire
255 // string anyhow
256 break;
257 }
258
259 // advance the input pointer past the end of this chunk
260 while ( NotAllNULs(src, nulLen) )
261 {
262 // notice that we must skip over multiple bytes here as we suppose
263 // that if NUL takes 2 or 4 bytes, then all the other characters do
264 // too and so if advanced by a single byte we might erroneously
265 // detect sequences of NUL bytes in the middle of the input
266 src += nulLen;
267 }
268
269 src += nulLen; // skipping over its terminator as well
270
271 // note that ">=" (and not just "==") is needed here as the terminator
272 // we skipped just above could be inside or just after the buffer
273 // delimited by inEnd
274 if ( src >= srcEnd )
275 break;
276 }
277
278 return dstWritten;
279 }
280
281 size_t
282 wxMBConv::FromWChar(char *dst, size_t dstLen,
283 const wchar_t *src, size_t srcLen) const
284 {
285 // the number of chars [which would be] written to dst [if it were not NULL]
286 size_t dstWritten = 0;
287
288 // make a copy of the input string unless it is already properly
289 // NUL-terminated
290 //
291 // if we don't know its length we have no choice but to assume that it is,
292 // indeed, properly terminated
293 wxWCharBuffer bufTmp;
294 if ( srcLen == wxNO_LEN )
295 {
296 srcLen = wxWcslen(src) + 1;
297 }
298 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
299 {
300 // make a copy in order to properly NUL-terminate the string
301 bufTmp = wxWCharBuffer(srcLen);
302 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
303 src = bufTmp;
304 }
305
306 const size_t lenNul = GetMBNulLen();
307 for ( const wchar_t * const srcEnd = src + srcLen;
308 src < srcEnd;
309 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
310 {
311 // try to convert the current chunk
312 size_t lenChunk = WC2MB(NULL, src, 0);
313
314 if ( lenChunk == wxCONV_FAILED )
315 return wxCONV_FAILED;
316
317 lenChunk += lenNul;
318 dstWritten += lenChunk;
319
320 if ( dst )
321 {
322 if ( dstWritten > dstLen )
323 return wxCONV_FAILED;
324
325 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
326 return wxCONV_FAILED;
327
328 dst += lenChunk;
329 }
330 }
331
332 return dstWritten;
333 }
334
335 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
336 {
337 size_t rc = ToWChar(outBuff, outLen, inBuff);
338 if ( rc != wxCONV_FAILED )
339 {
340 // ToWChar() returns the buffer length, i.e. including the trailing
341 // NUL, while this method doesn't take it into account
342 rc--;
343 }
344
345 return rc;
346 }
347
348 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
349 {
350 size_t rc = FromWChar(outBuff, outLen, inBuff);
351 if ( rc != wxCONV_FAILED )
352 {
353 rc -= GetMBNulLen();
354 }
355
356 return rc;
357 }
358
359 wxMBConv::~wxMBConv()
360 {
361 // nothing to do here (necessary for Darwin linking probably)
362 }
363
364 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
365 {
366 if ( psz )
367 {
368 // calculate the length of the buffer needed first
369 const size_t nLen = MB2WC(NULL, psz, 0);
370 if ( nLen != wxCONV_FAILED )
371 {
372 // now do the actual conversion
373 wxWCharBuffer buf(nLen /* +1 added implicitly */);
374
375 // +1 for the trailing NULL
376 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
377 return buf;
378 }
379 }
380
381 return wxWCharBuffer();
382 }
383
384 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
385 {
386 if ( pwz )
387 {
388 const size_t nLen = WC2MB(NULL, pwz, 0);
389 if ( nLen != wxCONV_FAILED )
390 {
391 // extra space for trailing NUL(s)
392 static const size_t extraLen = GetMaxMBNulLen();
393
394 wxCharBuffer buf(nLen + extraLen - 1);
395 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
396 return buf;
397 }
398 }
399
400 return wxCharBuffer();
401 }
402
403 const wxWCharBuffer
404 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
405 {
406 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
407 if ( dstLen != wxCONV_FAILED )
408 {
409 wxWCharBuffer wbuf(dstLen - 1);
410 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
411 {
412 if ( outLen )
413 {
414 *outLen = dstLen;
415 if ( wbuf[dstLen - 1] == L'\0' )
416 (*outLen)--;
417 }
418
419 return wbuf;
420 }
421 }
422
423 if ( outLen )
424 *outLen = 0;
425
426 return wxWCharBuffer();
427 }
428
429 const wxCharBuffer
430 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
431 {
432 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
433 if ( dstLen != wxCONV_FAILED )
434 {
435 // special case of empty input: can't allocate 0 size buffer below as
436 // wxCharBuffer insists on NUL-terminating it
437 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
438 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
439 {
440 if ( outLen )
441 {
442 *outLen = dstLen;
443
444 const size_t nulLen = GetMBNulLen();
445 if ( dstLen >= nulLen &&
446 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
447 {
448 // in this case the output is NUL-terminated and we're not
449 // supposed to count NUL
450 *outLen -= nulLen;
451 }
452 }
453
454 return buf;
455 }
456 }
457
458 if ( outLen )
459 *outLen = 0;
460
461 return wxCharBuffer();
462 }
463
464 // ----------------------------------------------------------------------------
465 // wxMBConvLibc
466 // ----------------------------------------------------------------------------
467
468 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
469 {
470 return wxMB2WC(buf, psz, n);
471 }
472
473 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
474 {
475 return wxWC2MB(buf, psz, n);
476 }
477
478 // ----------------------------------------------------------------------------
479 // wxConvBrokenFileNames
480 // ----------------------------------------------------------------------------
481
482 #ifdef __UNIX__
483
484 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
485 {
486 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
487 wxStricmp(charset, _T("UTF8")) == 0 )
488 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
489 else
490 m_conv = new wxCSConv(charset);
491 }
492
493 #endif // __UNIX__
494
495 // ----------------------------------------------------------------------------
496 // UTF-7
497 // ----------------------------------------------------------------------------
498
499 // Implementation (C) 2004 Fredrik Roubert
500
501 //
502 // BASE64 decoding table
503 //
// indexed by the byte value: gives the 6 bit value of this byte as a BASE64
// digit or 0xff if the byte is not a BASE64 digit (16 entries per row)
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' .. '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' .. 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' .. 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' .. 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' .. 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: no BASE64 digits here
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
539
540 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
541 {
542 size_t len = 0;
543
544 while ( *psz && (!buf || (len < n)) )
545 {
546 unsigned char cc = *psz++;
547 if (cc != '+')
548 {
549 // plain ASCII char
550 if (buf)
551 *buf++ = cc;
552 len++;
553 }
554 else if (*psz == '-')
555 {
556 // encoded plus sign
557 if (buf)
558 *buf++ = cc;
559 len++;
560 psz++;
561 }
562 else // start of BASE64 encoded string
563 {
564 bool lsb, ok;
565 unsigned int d, l;
566 for ( ok = lsb = false, d = 0, l = 0;
567 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
568 psz++ )
569 {
570 d <<= 6;
571 d += cc;
572 for (l += 6; l >= 8; lsb = !lsb)
573 {
574 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
575 if (lsb)
576 {
577 if (buf)
578 *buf++ |= c;
579 len ++;
580 }
581 else
582 {
583 if (buf)
584 *buf = (wchar_t)(c << 8);
585 }
586
587 ok = true;
588 }
589 }
590
591 if ( !ok )
592 {
593 // in valid UTF7 we should have valid characters after '+'
594 return wxCONV_FAILED;
595 }
596
597 if (*psz == '-')
598 psz++;
599 }
600 }
601
602 if ( buf && (len < n) )
603 *buf = '\0';
604
605 return len;
606 }
607
608 //
609 // BASE64 encoding table
610 //
// the BASE64 digits indexed by their 6 bit values
static const unsigned char utf7enb64[] =
{
    // 0 .. 25: upper case letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26 .. 51: lower case letters
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52 .. 61: digits
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62 and 63
    '+', '/'
};
622
623 //
624 // UTF-7 encoding table
625 //
626 // 0 - Set D (directly encoded characters)
627 // 1 - Set O (optional direct characters)
628 // 2 - whitespace characters (optional)
629 // 3 - special characters
630 //
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x00 - 0x07
    3, 2, 2, 3, 3, 2, 3, 3,     // 0x08 - 0x0f
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x10 - 0x17
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x18 - 0x1f
    2, 1, 1, 1, 1, 1, 1, 0,     // 0x20 - 0x27
    0, 0, 1, 3, 0, 0, 0, 3,     // 0x28 - 0x2f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x30 - 0x37
    0, 0, 0, 1, 1, 1, 1, 0,     // 0x38 - 0x3f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x40 - 0x47
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x48 - 0x4f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x50 - 0x57
    0, 0, 0, 1, 3, 1, 1, 1,     // 0x58 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x60 - 0x67
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x68 - 0x6f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x70 - 0x77
    0, 0, 0, 1, 1, 1, 3, 3      // 0x78 - 0x7f
};
642
643 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
644 {
645 size_t len = 0;
646
647 while (*psz && ((!buf) || (len < n)))
648 {
649 wchar_t cc = *psz++;
650 if (cc < 0x80 && utf7encode[cc] < 1)
651 {
652 // plain ASCII char
653 if (buf)
654 *buf++ = (char)cc;
655
656 len++;
657 }
658 #ifndef WC_UTF16
659 else if (((wxUint32)cc) > 0xffff)
660 {
661 // no surrogate pair generation (yet?)
662 return wxCONV_FAILED;
663 }
664 #endif
665 else
666 {
667 if (buf)
668 *buf++ = '+';
669
670 len++;
671 if (cc != '+')
672 {
673 // BASE64 encode string
674 unsigned int lsb, d, l;
675 for (d = 0, l = 0; /*nothing*/; psz++)
676 {
677 for (lsb = 0; lsb < 2; lsb ++)
678 {
679 d <<= 8;
680 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
681
682 for (l += 8; l >= 6; )
683 {
684 l -= 6;
685 if (buf)
686 *buf++ = utf7enb64[(d >> l) % 64];
687 len++;
688 }
689 }
690
691 cc = *psz;
692 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
693 break;
694 }
695
696 if (l != 0)
697 {
698 if (buf)
699 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
700
701 len++;
702 }
703 }
704
705 if (buf)
706 *buf++ = '-';
707 len++;
708 }
709 }
710
711 if (buf && (len < n))
712 *buf = 0;
713
714 return len;
715 }
716
717 // ----------------------------------------------------------------------------
718 // UTF-8
719 // ----------------------------------------------------------------------------
720
721 static wxUint32 utf8_max[]=
722 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
723
724 // boundaries of the private use area we use to (temporarily) remap invalid
725 // characters invalid in a UTF-8 encoded string
726 const wxUint32 wxUnicodePUA = 0x100000;
727 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
728
729 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
730 {
731 size_t len = 0;
732
733 while (*psz && ((!buf) || (len < n)))
734 {
735 const char *opsz = psz;
736 bool invalid = false;
737 unsigned char cc = *psz++, fc = cc;
738 unsigned cnt;
739 for (cnt = 0; fc & 0x80; cnt++)
740 fc <<= 1;
741
742 if (!cnt)
743 {
744 // plain ASCII char
745 if (buf)
746 *buf++ = cc;
747 len++;
748
749 // escape the escape character for octal escapes
750 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
751 && cc == '\\' && (!buf || len < n))
752 {
753 if (buf)
754 *buf++ = cc;
755 len++;
756 }
757 }
758 else
759 {
760 cnt--;
761 if (!cnt)
762 {
763 // invalid UTF-8 sequence
764 invalid = true;
765 }
766 else
767 {
768 unsigned ocnt = cnt - 1;
769 wxUint32 res = cc & (0x3f >> cnt);
770 while (cnt--)
771 {
772 cc = *psz;
773 if ((cc & 0xC0) != 0x80)
774 {
775 // invalid UTF-8 sequence
776 invalid = true;
777 break;
778 }
779
780 psz++;
781 res = (res << 6) | (cc & 0x3f);
782 }
783
784 if (invalid || res <= utf8_max[ocnt])
785 {
786 // illegal UTF-8 encoding
787 invalid = true;
788 }
789 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
790 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
791 {
792 // if one of our PUA characters turns up externally
793 // it must also be treated as an illegal sequence
794 // (a bit like you have to escape an escape character)
795 invalid = true;
796 }
797 else
798 {
799 #ifdef WC_UTF16
800 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
801 size_t pa = encode_utf16(res, (wxUint16 *)buf);
802 if (pa == wxCONV_FAILED)
803 {
804 invalid = true;
805 }
806 else
807 {
808 if (buf)
809 buf += pa;
810 len += pa;
811 }
812 #else // !WC_UTF16
813 if (buf)
814 *buf++ = (wchar_t)res;
815 len++;
816 #endif // WC_UTF16/!WC_UTF16
817 }
818 }
819
820 if (invalid)
821 {
822 if (m_options & MAP_INVALID_UTF8_TO_PUA)
823 {
824 while (opsz < psz && (!buf || len < n))
825 {
826 #ifdef WC_UTF16
827 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
828 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
829 wxASSERT(pa != wxCONV_FAILED);
830 if (buf)
831 buf += pa;
832 opsz++;
833 len += pa;
834 #else
835 if (buf)
836 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
837 opsz++;
838 len++;
839 #endif
840 }
841 }
842 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
843 {
844 while (opsz < psz && (!buf || len < n))
845 {
846 if ( buf && len + 3 < n )
847 {
848 unsigned char on = *opsz;
849 *buf++ = L'\\';
850 *buf++ = (wchar_t)( L'0' + on / 0100 );
851 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
852 *buf++ = (wchar_t)( L'0' + on % 010 );
853 }
854
855 opsz++;
856 len += 4;
857 }
858 }
859 else // MAP_INVALID_UTF8_NOT
860 {
861 return wxCONV_FAILED;
862 }
863 }
864 }
865 }
866
867 if (buf && (len < n))
868 *buf = 0;
869
870 return len;
871 }
872
// returns true if wch is one of the octal digits L'0' .. L'7'
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
877
878 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
879 {
880 size_t len = 0;
881
882 while (*psz && ((!buf) || (len < n)))
883 {
884 wxUint32 cc;
885
886 #ifdef WC_UTF16
887 // cast is ok for WC_UTF16
888 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
889 psz += (pa == wxCONV_FAILED) ? 1 : pa;
890 #else
891 cc = (*psz++) & 0x7fffffff;
892 #endif
893
894 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
895 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
896 {
897 if (buf)
898 *buf++ = (char)(cc - wxUnicodePUA);
899 len++;
900 }
901 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
902 && cc == L'\\' && psz[0] == L'\\' )
903 {
904 if (buf)
905 *buf++ = (char)cc;
906 psz++;
907 len++;
908 }
909 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
910 cc == L'\\' &&
911 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
912 {
913 if (buf)
914 {
915 *buf++ = (char) ((psz[0] - L'0') * 0100 +
916 (psz[1] - L'0') * 010 +
917 (psz[2] - L'0'));
918 }
919
920 psz += 3;
921 len++;
922 }
923 else
924 {
925 unsigned cnt;
926 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
927 {
928 }
929
930 if (!cnt)
931 {
932 // plain ASCII char
933 if (buf)
934 *buf++ = (char) cc;
935 len++;
936 }
937 else
938 {
939 len += cnt + 1;
940 if (buf)
941 {
942 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
943 while (cnt--)
944 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
945 }
946 }
947 }
948 }
949
950 if (buf && (len < n))
951 *buf = 0;
952
953 return len;
954 }
955
956 // ============================================================================
957 // UTF-16
958 // ============================================================================
959
// "straight" is the class for which no byte swapping is needed and "swap" is
// the other one, which is which depends on WORDS_BIGENDIAN
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
967
968 /* static */
969 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
970 {
971 if ( srcLen == wxNO_LEN )
972 {
973 // count the number of bytes in input, including the trailing NULs
974 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
975 for ( srcLen = 1; *inBuff++; srcLen++ )
976 ;
977
978 srcLen *= BYTES_PER_CHAR;
979 }
980 else // we already have the length
981 {
982 // we can only convert an entire number of UTF-16 characters
983 if ( srcLen % BYTES_PER_CHAR )
984 return wxCONV_FAILED;
985 }
986
987 return srcLen;
988 }
989
990 // case when in-memory representation is UTF-16 too
991 #ifdef WC_UTF16
992
993 // ----------------------------------------------------------------------------
994 // conversions without endianness change
995 // ----------------------------------------------------------------------------
996
997 size_t
998 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
999 const char *src, size_t srcLen) const
1000 {
1001 // set up the scene for using memcpy() (which is presumably more efficient
1002 // than copying the bytes one by one)
1003 srcLen = GetLength(src, srcLen);
1004 if ( srcLen == wxNO_LEN )
1005 return wxCONV_FAILED;
1006
1007 const size_t inLen = srcLen / BYTES_PER_CHAR;
1008 if ( dst )
1009 {
1010 if ( dstLen < inLen )
1011 return wxCONV_FAILED;
1012
1013 memcpy(dst, src, srcLen);
1014 }
1015
1016 return inLen;
1017 }
1018
1019 size_t
1020 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1021 const wchar_t *src, size_t srcLen) const
1022 {
1023 if ( srcLen == wxNO_LEN )
1024 srcLen = wxWcslen(src) + 1;
1025
1026 srcLen *= BYTES_PER_CHAR;
1027
1028 if ( dst )
1029 {
1030 if ( dstLen < srcLen )
1031 return wxCONV_FAILED;
1032
1033 memcpy(dst, src, srcLen);
1034 }
1035
1036 return srcLen;
1037 }
1038
1039 // ----------------------------------------------------------------------------
1040 // endian-reversing conversions
1041 // ----------------------------------------------------------------------------
1042
1043 size_t
1044 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1045 const char *src, size_t srcLen) const
1046 {
1047 srcLen = GetLength(src, srcLen);
1048 if ( srcLen == wxNO_LEN )
1049 return wxCONV_FAILED;
1050
1051 srcLen /= BYTES_PER_CHAR;
1052
1053 if ( dst )
1054 {
1055 if ( dstLen < srcLen )
1056 return wxCONV_FAILED;
1057
1058 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1059 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1060 {
1061 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1062 }
1063 }
1064
1065 return srcLen;
1066 }
1067
1068 size_t
1069 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1070 const wchar_t *src, size_t srcLen) const
1071 {
1072 if ( srcLen == wxNO_LEN )
1073 srcLen = wxWcslen(src) + 1;
1074
1075 srcLen *= BYTES_PER_CHAR;
1076
1077 if ( dst )
1078 {
1079 if ( dstLen < srcLen )
1080 return wxCONV_FAILED;
1081
1082 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1083 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1084 {
1085 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1086 }
1087 }
1088
1089 return srcLen;
1090 }
1091
1092 #else // !WC_UTF16: wchar_t is UTF-32
1093
1094 // ----------------------------------------------------------------------------
1095 // conversions without endianness change
1096 // ----------------------------------------------------------------------------
1097
1098 size_t
1099 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1100 const char *src, size_t srcLen) const
1101 {
1102 srcLen = GetLength(src, srcLen);
1103 if ( srcLen == wxNO_LEN )
1104 return wxCONV_FAILED;
1105
1106 const size_t inLen = srcLen / BYTES_PER_CHAR;
1107 if ( !dst )
1108 {
1109 // optimization: return maximal space which could be needed for this
1110 // string even if the real size could be smaller if the buffer contains
1111 // any surrogates
1112 return inLen;
1113 }
1114
1115 size_t outLen = 0;
1116 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1117 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1118 {
1119 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1120 if ( !inBuff )
1121 return wxCONV_FAILED;
1122
1123 if ( ++outLen > dstLen )
1124 return wxCONV_FAILED;
1125
1126 *dst++ = ch;
1127 }
1128
1129
1130 return outLen;
1131 }
1132
1133 size_t
1134 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1135 const wchar_t *src, size_t srcLen) const
1136 {
1137 if ( srcLen == wxNO_LEN )
1138 srcLen = wxWcslen(src) + 1;
1139
1140 size_t outLen = 0;
1141 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1142 for ( size_t n = 0; n < srcLen; n++ )
1143 {
1144 wxUint16 cc[2];
1145 const size_t numChars = encode_utf16(*src++, cc);
1146 if ( numChars == wxCONV_FAILED )
1147 return wxCONV_FAILED;
1148
1149 outLen += numChars * BYTES_PER_CHAR;
1150 if ( outBuff )
1151 {
1152 if ( outLen > dstLen )
1153 return wxCONV_FAILED;
1154
1155 *outBuff++ = cc[0];
1156 if ( numChars == 2 )
1157 {
1158 // second character of a surrogate
1159 *outBuff++ = cc[1];
1160 }
1161 }
1162 }
1163
1164 return outLen;
1165 }
1166
1167 // ----------------------------------------------------------------------------
1168 // endian-reversing conversions
1169 // ----------------------------------------------------------------------------
1170
1171 size_t
1172 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1173 const char *src, size_t srcLen) const
1174 {
1175 srcLen = GetLength(src, srcLen);
1176 if ( srcLen == wxNO_LEN )
1177 return wxCONV_FAILED;
1178
1179 const size_t inLen = srcLen / BYTES_PER_CHAR;
1180 if ( !dst )
1181 {
1182 // optimization: return maximal space which could be needed for this
1183 // string even if the real size could be smaller if the buffer contains
1184 // any surrogates
1185 return inLen;
1186 }
1187
1188 size_t outLen = 0;
1189 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1190 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1191 {
1192 wxUint32 ch;
1193 wxUint16 tmp[2];
1194
1195 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1196 inBuff++;
1197 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1198
1199 const size_t numChars = decode_utf16(tmp, ch);
1200 if ( numChars == wxCONV_FAILED )
1201 return wxCONV_FAILED;
1202
1203 if ( numChars == 2 )
1204 inBuff++;
1205
1206 if ( ++outLen > dstLen )
1207 return wxCONV_FAILED;
1208
1209 *dst++ = ch;
1210 }
1211
1212
1213 return outLen;
1214 }
1215
1216 size_t
1217 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1218 const wchar_t *src, size_t srcLen) const
1219 {
1220 if ( srcLen == wxNO_LEN )
1221 srcLen = wxWcslen(src) + 1;
1222
1223 size_t outLen = 0;
1224 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1225 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1226 {
1227 wxUint16 cc[2];
1228 const size_t numChars = encode_utf16(*src, cc);
1229 if ( numChars == wxCONV_FAILED )
1230 return wxCONV_FAILED;
1231
1232 outLen += numChars * BYTES_PER_CHAR;
1233 if ( outBuff )
1234 {
1235 if ( outLen > dstLen )
1236 return wxCONV_FAILED;
1237
1238 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1239 if ( numChars == 2 )
1240 {
1241 // second character of a surrogate
1242 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1243 }
1244 }
1245 }
1246
1247 return outLen;
1248 }
1249
1250 #endif // WC_UTF16/!WC_UTF16
1251
1252
1253 // ============================================================================
1254 // UTF-32
1255 // ============================================================================
1256
1257 #ifdef WORDS_BIGENDIAN
1258 #define wxMBConvUTF32straight wxMBConvUTF32BE
1259 #define wxMBConvUTF32swap wxMBConvUTF32LE
1260 #else
1261 #define wxMBConvUTF32swap wxMBConvUTF32BE
1262 #define wxMBConvUTF32straight wxMBConvUTF32LE
1263 #endif
1264
1265
1266 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1267 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1268
1269 /* static */
1270 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1271 {
1272 if ( srcLen == wxNO_LEN )
1273 {
1274 // count the number of bytes in input, including the trailing NULs
1275 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1276 for ( srcLen = 1; *inBuff++; srcLen++ )
1277 ;
1278
1279 srcLen *= BYTES_PER_CHAR;
1280 }
1281 else // we already have the length
1282 {
1283 // we can only convert an entire number of UTF-32 characters
1284 if ( srcLen % BYTES_PER_CHAR )
1285 return wxCONV_FAILED;
1286 }
1287
1288 return srcLen;
1289 }
1290
1291 // case when in-memory representation is UTF-16
1292 #ifdef WC_UTF16
1293
1294 // ----------------------------------------------------------------------------
1295 // conversions without endianness change
1296 // ----------------------------------------------------------------------------
1297
1298 size_t
1299 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1300 const char *src, size_t srcLen) const
1301 {
1302 srcLen = GetLength(src, srcLen);
1303 if ( srcLen == wxNO_LEN )
1304 return wxCONV_FAILED;
1305
1306 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1307 const size_t inLen = srcLen / BYTES_PER_CHAR;
1308 size_t outLen = 0;
1309 for ( size_t n = 0; n < inLen; n++ )
1310 {
1311 wxUint16 cc[2];
1312 const size_t numChars = encode_utf16(*inBuff++, cc);
1313 if ( numChars == wxCONV_FAILED )
1314 return wxCONV_FAILED;
1315
1316 outLen += numChars;
1317 if ( dst )
1318 {
1319 if ( outLen > dstLen )
1320 return wxCONV_FAILED;
1321
1322 *dst++ = cc[0];
1323 if ( numChars == 2 )
1324 {
1325 // second character of a surrogate
1326 *dst++ = cc[1];
1327 }
1328 }
1329 }
1330
1331 return outLen;
1332 }
1333
1334 size_t
1335 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1336 const wchar_t *src, size_t srcLen) const
1337 {
1338 if ( srcLen == wxNO_LEN )
1339 srcLen = wxWcslen(src) + 1;
1340
1341 if ( !dst )
1342 {
1343 // optimization: return maximal space which could be needed for this
1344 // string instead of the exact amount which could be less if there are
1345 // any surrogates in the input
1346 //
1347 // we consider that surrogates are rare enough to make it worthwhile to
1348 // avoid running the loop below at the cost of slightly extra memory
1349 // consumption
1350 return srcLen * BYTES_PER_CHAR;
1351 }
1352
1353 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1354 size_t outLen = 0;
1355 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1356 {
1357 const wxUint32 ch = wxDecodeSurrogate(&src);
1358 if ( !src )
1359 return wxCONV_FAILED;
1360
1361 outLen += BYTES_PER_CHAR;
1362
1363 if ( outLen > dstLen )
1364 return wxCONV_FAILED;
1365
1366 *outBuff++ = ch;
1367 }
1368
1369 return outLen;
1370 }
1371
1372 // ----------------------------------------------------------------------------
1373 // endian-reversing conversions
1374 // ----------------------------------------------------------------------------
1375
1376 size_t
1377 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1378 const char *src, size_t srcLen) const
1379 {
1380 srcLen = GetLength(src, srcLen);
1381 if ( srcLen == wxNO_LEN )
1382 return wxCONV_FAILED;
1383
1384 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1385 const size_t inLen = srcLen / BYTES_PER_CHAR;
1386 size_t outLen = 0;
1387 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1388 {
1389 wxUint16 cc[2];
1390 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1391 if ( numChars == wxCONV_FAILED )
1392 return wxCONV_FAILED;
1393
1394 outLen += numChars;
1395 if ( dst )
1396 {
1397 if ( outLen > dstLen )
1398 return wxCONV_FAILED;
1399
1400 *dst++ = cc[0];
1401 if ( numChars == 2 )
1402 {
1403 // second character of a surrogate
1404 *dst++ = cc[1];
1405 }
1406 }
1407 }
1408
1409 return outLen;
1410 }
1411
1412 size_t
1413 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1414 const wchar_t *src, size_t srcLen) const
1415 {
1416 if ( srcLen == wxNO_LEN )
1417 srcLen = wxWcslen(src) + 1;
1418
1419 if ( !dst )
1420 {
1421 // optimization: return maximal space which could be needed for this
1422 // string instead of the exact amount which could be less if there are
1423 // any surrogates in the input
1424 //
1425 // we consider that surrogates are rare enough to make it worthwhile to
1426 // avoid running the loop below at the cost of slightly extra memory
1427 // consumption
1428 return srcLen*BYTES_PER_CHAR;
1429 }
1430
1431 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1432 size_t outLen = 0;
1433 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1434 {
1435 const wxUint32 ch = wxDecodeSurrogate(&src);
1436 if ( !src )
1437 return wxCONV_FAILED;
1438
1439 outLen += BYTES_PER_CHAR;
1440
1441 if ( outLen > dstLen )
1442 return wxCONV_FAILED;
1443
1444 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1445 }
1446
1447 return outLen;
1448 }
1449
1450 #else // !WC_UTF16: wchar_t is UTF-32
1451
1452 // ----------------------------------------------------------------------------
1453 // conversions without endianness change
1454 // ----------------------------------------------------------------------------
1455
1456 size_t
1457 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1458 const char *src, size_t srcLen) const
1459 {
1460 // use memcpy() as it should be much faster than hand-written loop
1461 srcLen = GetLength(src, srcLen);
1462 if ( srcLen == wxNO_LEN )
1463 return wxCONV_FAILED;
1464
1465 const size_t inLen = srcLen/BYTES_PER_CHAR;
1466 if ( dst )
1467 {
1468 if ( dstLen < inLen )
1469 return wxCONV_FAILED;
1470
1471 memcpy(dst, src, srcLen);
1472 }
1473
1474 return inLen;
1475 }
1476
1477 size_t
1478 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1479 const wchar_t *src, size_t srcLen) const
1480 {
1481 if ( srcLen == wxNO_LEN )
1482 srcLen = wxWcslen(src) + 1;
1483
1484 srcLen *= BYTES_PER_CHAR;
1485
1486 if ( dst )
1487 {
1488 if ( dstLen < srcLen )
1489 return wxCONV_FAILED;
1490
1491 memcpy(dst, src, srcLen);
1492 }
1493
1494 return srcLen;
1495 }
1496
1497 // ----------------------------------------------------------------------------
1498 // endian-reversing conversions
1499 // ----------------------------------------------------------------------------
1500
1501 size_t
1502 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1503 const char *src, size_t srcLen) const
1504 {
1505 srcLen = GetLength(src, srcLen);
1506 if ( srcLen == wxNO_LEN )
1507 return wxCONV_FAILED;
1508
1509 srcLen /= BYTES_PER_CHAR;
1510
1511 if ( dst )
1512 {
1513 if ( dstLen < srcLen )
1514 return wxCONV_FAILED;
1515
1516 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1517 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1518 {
1519 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1520 }
1521 }
1522
1523 return srcLen;
1524 }
1525
1526 size_t
1527 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1528 const wchar_t *src, size_t srcLen) const
1529 {
1530 if ( srcLen == wxNO_LEN )
1531 srcLen = wxWcslen(src) + 1;
1532
1533 srcLen *= BYTES_PER_CHAR;
1534
1535 if ( dst )
1536 {
1537 if ( dstLen < srcLen )
1538 return wxCONV_FAILED;
1539
1540 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1541 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1542 {
1543 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1544 }
1545 }
1546
1547 return srcLen;
1548 }
1549
1550 #endif // WC_UTF16/!WC_UTF16
1551
1552
1553 // ============================================================================
1554 // The classes doing conversion using the iconv_xxx() functions
1555 // ============================================================================
1556
1557 #ifdef HAVE_ICONV
1558
1559 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1560 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1561 // (unless there's yet another bug in glibc) the only case when iconv()
1562 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1563 // left in the input buffer -- when _real_ error occurs,
1564 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1565 // iconv() failure.
1566 // [This bug does not appear in glibc 2.2.]
1567 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1568 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1569 (errno != E2BIG || bufLeft != 0))
1570 #else
1571 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1572 #endif
1573
1574 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1575
1576 #define ICONV_T_INVALID ((iconv_t)-1)
1577
1578 #if SIZEOF_WCHAR_T == 4
1579 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1580 #define WC_ENC wxFONTENCODING_UTF32
1581 #elif SIZEOF_WCHAR_T == 2
1582 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1583 #define WC_ENC wxFONTENCODING_UTF16
1584 #else // sizeof(wchar_t) != 2 nor 4
1585 // does this ever happen?
1586 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1587 #endif
1588
1589 // ----------------------------------------------------------------------------
1590 // wxMBConv_iconv: encapsulates an iconv character set
1591 // ----------------------------------------------------------------------------
1592
1593 class wxMBConv_iconv : public wxMBConv
1594 {
1595 public:
1596 wxMBConv_iconv(const char *name);
1597 virtual ~wxMBConv_iconv();
1598
1599 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1600 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1601
1602 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1603 virtual size_t GetMBNulLen() const;
1604
1605 #if wxUSE_UNICODE_UTF8
1606 virtual bool IsUTF8() const;
1607 #endif
1608
1609 virtual wxMBConv *Clone() const
1610 {
1611 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1612 p->m_minMBCharWidth = m_minMBCharWidth;
1613 return p;
1614 }
1615
1616 bool IsOk() const
1617 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1618
1619 protected:
1620 // the iconv handlers used to translate from multibyte
1621 // to wide char and in the other direction
1622 iconv_t m2w,
1623 w2m;
1624
1625 #if wxUSE_THREADS
1626 // guards access to m2w and w2m objects
1627 wxMutex m_iconvMutex;
1628 #endif
1629
1630 private:
1631 // the name (for iconv_open()) of a wide char charset -- if none is
1632 // available on this machine, it will remain NULL
1633 static wxString ms_wcCharsetName;
1634
1635 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1636 // different endian-ness than the native one
1637 static bool ms_wcNeedsSwap;
1638
1639
1640 // name of the encoding handled by this conversion
1641 wxString m_name;
1642
1643 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1644 // initially
1645 size_t m_minMBCharWidth;
1646 };
1647
1648 // make the constructor available for unit testing
1649 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1650 {
1651 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1652 if ( !result->IsOk() )
1653 {
1654 delete result;
1655 return 0;
1656 }
1657
1658 return result;
1659 }
1660
1661 wxString wxMBConv_iconv::ms_wcCharsetName;
1662 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1663
1664 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1665 : m_name(name)
1666 {
1667 m_minMBCharWidth = 0;
1668
1669 // check for charset that represents wchar_t:
1670 if ( ms_wcCharsetName.empty() )
1671 {
1672 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1673
1674 #if wxUSE_FONTMAP
1675 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1676 #else // !wxUSE_FONTMAP
1677 static const wxChar *names_static[] =
1678 {
1679 #if SIZEOF_WCHAR_T == 4
1680 _T("UCS-4"),
1681 #elif SIZEOF_WCHAR_T = 2
1682 _T("UCS-2"),
1683 #endif
1684 NULL
1685 };
1686 const wxChar **names = names_static;
1687 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1688
1689 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1690 {
1691 const wxString nameCS(*names);
1692
1693 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1694 wxString nameXE(nameCS);
1695
1696 #ifdef WORDS_BIGENDIAN
1697 nameXE += _T("BE");
1698 #else // little endian
1699 nameXE += _T("LE");
1700 #endif
1701
1702 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1703 nameXE.c_str());
1704
1705 m2w = iconv_open(nameXE.ToAscii(), name);
1706 if ( m2w == ICONV_T_INVALID )
1707 {
1708 // try charset w/o bytesex info (e.g. "UCS4")
1709 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1710 nameCS.c_str());
1711 m2w = iconv_open(nameCS.ToAscii(), name);
1712
1713 // and check for bytesex ourselves:
1714 if ( m2w != ICONV_T_INVALID )
1715 {
1716 char buf[2], *bufPtr;
1717 wchar_t wbuf[2], *wbufPtr;
1718 size_t insz, outsz;
1719 size_t res;
1720
1721 buf[0] = 'A';
1722 buf[1] = 0;
1723 wbuf[0] = 0;
1724 insz = 2;
1725 outsz = SIZEOF_WCHAR_T * 2;
1726 wbufPtr = wbuf;
1727 bufPtr = buf;
1728
1729 res = iconv(
1730 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1731 (char**)&wbufPtr, &outsz);
1732
1733 if (ICONV_FAILED(res, insz))
1734 {
1735 wxLogLastError(wxT("iconv"));
1736 wxLogError(_("Conversion to charset '%s' doesn't work."),
1737 nameCS.c_str());
1738 }
1739 else // ok, can convert to this encoding, remember it
1740 {
1741 ms_wcCharsetName = nameCS;
1742 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1743 }
1744 }
1745 }
1746 else // use charset not requiring byte swapping
1747 {
1748 ms_wcCharsetName = nameXE;
1749 }
1750 }
1751
1752 wxLogTrace(TRACE_STRCONV,
1753 wxT("iconv wchar_t charset is \"%s\"%s"),
1754 ms_wcCharsetName.empty() ? wxString("<none>")
1755 : ms_wcCharsetName,
1756 ms_wcNeedsSwap ? _T(" (needs swap)")
1757 : _T(""));
1758 }
1759 else // we already have ms_wcCharsetName
1760 {
1761 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
1762 }
1763
1764 if ( ms_wcCharsetName.empty() )
1765 {
1766 w2m = ICONV_T_INVALID;
1767 }
1768 else
1769 {
1770 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
1771 if ( w2m == ICONV_T_INVALID )
1772 {
1773 wxLogTrace(TRACE_STRCONV,
1774 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1775 ms_wcCharsetName.c_str(), name);
1776 }
1777 }
1778 }
1779
1780 wxMBConv_iconv::~wxMBConv_iconv()
1781 {
1782 if ( m2w != ICONV_T_INVALID )
1783 iconv_close(m2w);
1784 if ( w2m != ICONV_T_INVALID )
1785 iconv_close(w2m);
1786 }
1787
1788 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1789 {
1790 // find the string length: notice that must be done differently for
1791 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1792 size_t inbuf;
1793 const size_t nulLen = GetMBNulLen();
1794 switch ( nulLen )
1795 {
1796 default:
1797 return wxCONV_FAILED;
1798
1799 case 1:
1800 inbuf = strlen(psz); // arguably more optimized than our version
1801 break;
1802
1803 case 2:
1804 case 4:
1805 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1806 // they also have to start at character boundary and not span two
1807 // adjacent characters
1808 const char *p;
1809 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1810 ;
1811 inbuf = p - psz;
1812 break;
1813 }
1814
1815 #if wxUSE_THREADS
1816 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1817 // Unfortunately there are a couple of global wxCSConv objects such as
1818 // wxConvLocal that are used all over wx code, so we have to make sure
1819 // the handle is used by at most one thread at the time. Otherwise
1820 // only a few wx classes would be safe to use from non-main threads
1821 // as MB<->WC conversion would fail "randomly".
1822 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1823 #endif // wxUSE_THREADS
1824
1825 size_t outbuf = n * SIZEOF_WCHAR_T;
1826 size_t res, cres;
1827 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1828 wchar_t *bufPtr = buf;
1829 const char *pszPtr = psz;
1830
1831 if (buf)
1832 {
1833 // have destination buffer, convert there
1834 cres = iconv(m2w,
1835 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1836 (char**)&bufPtr, &outbuf);
1837 res = n - (outbuf / SIZEOF_WCHAR_T);
1838
1839 if (ms_wcNeedsSwap)
1840 {
1841 // convert to native endianness
1842 for ( unsigned i = 0; i < res; i++ )
1843 buf[n] = WC_BSWAP(buf[i]);
1844 }
1845
1846 // NUL-terminate the string if there is any space left
1847 if (res < n)
1848 buf[res] = 0;
1849 }
1850 else
1851 {
1852 // no destination buffer... convert using temp buffer
1853 // to calculate destination buffer requirement
1854 wchar_t tbuf[8];
1855 res = 0;
1856
1857 do
1858 {
1859 bufPtr = tbuf;
1860 outbuf = 8 * SIZEOF_WCHAR_T;
1861
1862 cres = iconv(m2w,
1863 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1864 (char**)&bufPtr, &outbuf );
1865
1866 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1867 }
1868 while ((cres == (size_t)-1) && (errno == E2BIG));
1869 }
1870
1871 if (ICONV_FAILED(cres, inbuf))
1872 {
1873 //VS: it is ok if iconv fails, hence trace only
1874 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1875 return wxCONV_FAILED;
1876 }
1877
1878 return res;
1879 }
1880
1881 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1882 {
1883 #if wxUSE_THREADS
1884 // NB: explained in MB2WC
1885 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1886 #endif
1887
1888 size_t inlen = wxWcslen(psz);
1889 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1890 size_t outbuf = n;
1891 size_t res, cres;
1892
1893 wchar_t *tmpbuf = 0;
1894
1895 if (ms_wcNeedsSwap)
1896 {
1897 // need to copy to temp buffer to switch endianness
1898 // (doing WC_BSWAP twice on the original buffer won't help, as it
1899 // could be in read-only memory, or be accessed in some other thread)
1900 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1901 for ( size_t i = 0; i < inlen; i++ )
1902 tmpbuf[n] = WC_BSWAP(psz[i]);
1903
1904 tmpbuf[inlen] = L'\0';
1905 psz = tmpbuf;
1906 }
1907
1908 if (buf)
1909 {
1910 // have destination buffer, convert there
1911 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1912
1913 res = n - outbuf;
1914
1915 // NB: iconv was given only wcslen(psz) characters on input, and so
1916 // it couldn't convert the trailing zero. Let's do it ourselves
1917 // if there's some room left for it in the output buffer.
1918 if (res < n)
1919 buf[0] = 0;
1920 }
1921 else
1922 {
1923 // no destination buffer: convert using temp buffer
1924 // to calculate destination buffer requirement
1925 char tbuf[16];
1926 res = 0;
1927 do
1928 {
1929 buf = tbuf;
1930 outbuf = 16;
1931
1932 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1933
1934 res += 16 - outbuf;
1935 }
1936 while ((cres == (size_t)-1) && (errno == E2BIG));
1937 }
1938
1939 if (ms_wcNeedsSwap)
1940 {
1941 free(tmpbuf);
1942 }
1943
1944 if (ICONV_FAILED(cres, inbuf))
1945 {
1946 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1947 return wxCONV_FAILED;
1948 }
1949
1950 return res;
1951 }
1952
1953 size_t wxMBConv_iconv::GetMBNulLen() const
1954 {
1955 if ( m_minMBCharWidth == 0 )
1956 {
1957 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1958
1959 #if wxUSE_THREADS
1960 // NB: explained in MB2WC
1961 wxMutexLocker lock(self->m_iconvMutex);
1962 #endif
1963
1964 const wchar_t *wnul = L"";
1965 char buf[8]; // should be enough for NUL in any encoding
1966 size_t inLen = sizeof(wchar_t),
1967 outLen = WXSIZEOF(buf);
1968 char *inBuff = (char *)wnul;
1969 char *outBuff = buf;
1970 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1971 {
1972 self->m_minMBCharWidth = (size_t)-1;
1973 }
1974 else // ok
1975 {
1976 self->m_minMBCharWidth = outBuff - buf;
1977 }
1978 }
1979
1980 return m_minMBCharWidth;
1981 }
1982
1983 #if wxUSE_UNICODE_UTF8
1984 bool wxMBConv_iconv::IsUTF8() const
1985 {
1986 return wxStricmp(m_name, "UTF-8") == 0 ||
1987 wxStricmp(m_name, "UTF8") == 0;
1988 }
1989 #endif
1990
1991 #endif // HAVE_ICONV
1992
1993
1994 // ============================================================================
1995 // Win32 conversion classes
1996 // ============================================================================
1997
1998 #ifdef wxHAVE_WIN32_MB2WC
1999
2000 // from utils.cpp
2001 #if wxUSE_FONTMAP
2002 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2003 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2004 #endif
2005
2006 class wxMBConv_win32 : public wxMBConv
2007 {
2008 public:
2009 wxMBConv_win32()
2010 {
2011 m_CodePage = CP_ACP;
2012 m_minMBCharWidth = 0;
2013 }
2014
2015 wxMBConv_win32(const wxMBConv_win32& conv)
2016 : wxMBConv()
2017 {
2018 m_CodePage = conv.m_CodePage;
2019 m_minMBCharWidth = conv.m_minMBCharWidth;
2020 }
2021
2022 #if wxUSE_FONTMAP
2023 wxMBConv_win32(const char* name)
2024 {
2025 m_CodePage = wxCharsetToCodepage(name);
2026 m_minMBCharWidth = 0;
2027 }
2028
2029 wxMBConv_win32(wxFontEncoding encoding)
2030 {
2031 m_CodePage = wxEncodingToCodepage(encoding);
2032 m_minMBCharWidth = 0;
2033 }
2034 #endif // wxUSE_FONTMAP
2035
2036 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2037 {
2038 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2039 // the behaviour is not compatible with the Unix version (using iconv)
2040 // and break the library itself, e.g. wxTextInputStream::NextChar()
2041 // wouldn't work if reading an incomplete MB char didn't result in an
2042 // error
2043 //
2044 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2045 // Win XP or newer and it is not supported for UTF-[78] so we always
2046 // use our own conversions in this case. See
2047 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2048 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2049 if ( m_CodePage == CP_UTF8 )
2050 {
2051 return wxMBConvUTF8().MB2WC(buf, psz, n);
2052 }
2053
2054 if ( m_CodePage == CP_UTF7 )
2055 {
2056 return wxMBConvUTF7().MB2WC(buf, psz, n);
2057 }
2058
2059 int flags = 0;
2060 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2061 IsAtLeastWin2kSP4() )
2062 {
2063 flags = MB_ERR_INVALID_CHARS;
2064 }
2065
2066 const size_t len = ::MultiByteToWideChar
2067 (
2068 m_CodePage, // code page
2069 flags, // flags: fall on error
2070 psz, // input string
2071 -1, // its length (NUL-terminated)
2072 buf, // output string
2073 buf ? n : 0 // size of output buffer
2074 );
2075 if ( !len )
2076 {
2077 // function totally failed
2078 return wxCONV_FAILED;
2079 }
2080
2081 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2082 // check if we succeeded, by doing a double trip:
2083 if ( !flags && buf )
2084 {
2085 const size_t mbLen = strlen(psz);
2086 wxCharBuffer mbBuf(mbLen);
2087 if ( ::WideCharToMultiByte
2088 (
2089 m_CodePage,
2090 0,
2091 buf,
2092 -1,
2093 mbBuf.data(),
2094 mbLen + 1, // size in bytes, not length
2095 NULL,
2096 NULL
2097 ) == 0 ||
2098 strcmp(mbBuf, psz) != 0 )
2099 {
2100 // we didn't obtain the same thing we started from, hence
2101 // the conversion was lossy and we consider that it failed
2102 return wxCONV_FAILED;
2103 }
2104 }
2105
2106 // note that it returns count of written chars for buf != NULL and size
2107 // of the needed buffer for buf == NULL so in either case the length of
2108 // the string (which never includes the terminating NUL) is one less
2109 return len - 1;
2110 }
2111
2112 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2113 {
2114 /*
2115 we have a problem here: by default, WideCharToMultiByte() may
2116 replace characters unrepresentable in the target code page with bad
2117 quality approximations such as turning "1/2" symbol (U+00BD) into
2118 "1" for the code pages which don't have it and we, obviously, want
2119 to avoid this at any price
2120
2121 the trouble is that this function does it _silently_, i.e. it won't
2122 even tell us whether it did or not... Win98/2000 and higher provide
2123 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2124 we have to resort to a round trip, i.e. check that converting back
2125 results in the same string -- this is, of course, expensive but
2126 otherwise we simply can't be sure to not garble the data.
2127 */
2128
2129 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2130 // it doesn't work with CJK encodings (which we test for rather roughly
2131 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2132 // supporting it
2133 BOOL usedDef wxDUMMY_INITIALIZE(false);
2134 BOOL *pUsedDef;
2135 int flags;
2136 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2137 {
2138 // it's our lucky day
2139 flags = WC_NO_BEST_FIT_CHARS;
2140 pUsedDef = &usedDef;
2141 }
2142 else // old system or unsupported encoding
2143 {
2144 flags = 0;
2145 pUsedDef = NULL;
2146 }
2147
2148 const size_t len = ::WideCharToMultiByte
2149 (
2150 m_CodePage, // code page
2151 flags, // either none or no best fit
2152 pwz, // input string
2153 -1, // it is (wide) NUL-terminated
2154 buf, // output buffer
2155 buf ? n : 0, // and its size
2156 NULL, // default "replacement" char
2157 pUsedDef // [out] was it used?
2158 );
2159
2160 if ( !len )
2161 {
2162 // function totally failed
2163 return wxCONV_FAILED;
2164 }
2165
2166 // if we were really converting, check if we succeeded
2167 if ( buf )
2168 {
2169 if ( flags )
2170 {
2171 // check if the conversion failed, i.e. if any replacements
2172 // were done
2173 if ( usedDef )
2174 return wxCONV_FAILED;
2175 }
2176 else // we must resort to double tripping...
2177 {
2178 wxWCharBuffer wcBuf(n);
2179 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2180 wcscmp(wcBuf, pwz) != 0 )
2181 {
2182 // we didn't obtain the same thing we started from, hence
2183 // the conversion was lossy and we consider that it failed
2184 return wxCONV_FAILED;
2185 }
2186 }
2187 }
2188
2189 // see the comment above for the reason of "len - 1"
2190 return len - 1;
2191 }
2192
2193 virtual size_t GetMBNulLen() const
2194 {
2195 if ( m_minMBCharWidth == 0 )
2196 {
2197 int len = ::WideCharToMultiByte
2198 (
2199 m_CodePage, // code page
2200 0, // no flags
2201 L"", // input string
2202 1, // translate just the NUL
2203 NULL, // output buffer
2204 0, // and its size
2205 NULL, // no replacement char
2206 NULL // [out] don't care if it was used
2207 );
2208
2209 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2210 switch ( len )
2211 {
2212 default:
2213 wxLogDebug(_T("Unexpected NUL length %d"), len);
2214 self->m_minMBCharWidth = (size_t)-1;
2215 break;
2216
2217 case 0:
2218 self->m_minMBCharWidth = (size_t)-1;
2219 break;
2220
2221 case 1:
2222 case 2:
2223 case 4:
2224 self->m_minMBCharWidth = len;
2225 break;
2226 }
2227 }
2228
2229 return m_minMBCharWidth;
2230 }
2231
2232 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2233
2234 bool IsOk() const { return m_CodePage != -1; }
2235
2236 private:
2237 static bool CanUseNoBestFit()
2238 {
2239 static int s_isWin98Or2k = -1;
2240
2241 if ( s_isWin98Or2k == -1 )
2242 {
2243 int verMaj, verMin;
2244 switch ( wxGetOsVersion(&verMaj, &verMin) )
2245 {
2246 case wxOS_WINDOWS_9X:
2247 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2248 break;
2249
2250 case wxOS_WINDOWS_NT:
2251 s_isWin98Or2k = verMaj >= 5;
2252 break;
2253
2254 default:
2255 // unknown: be conservative by default
2256 s_isWin98Or2k = 0;
2257 break;
2258 }
2259
2260 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2261 }
2262
2263 return s_isWin98Or2k == 1;
2264 }
2265
2266 static bool IsAtLeastWin2kSP4()
2267 {
2268 #ifdef __WXWINCE__
2269 return false;
2270 #else
2271 static int s_isAtLeastWin2kSP4 = -1;
2272
2273 if ( s_isAtLeastWin2kSP4 == -1 )
2274 {
2275 OSVERSIONINFOEX ver;
2276
2277 memset(&ver, 0, sizeof(ver));
2278 ver.dwOSVersionInfoSize = sizeof(ver);
2279 GetVersionEx((OSVERSIONINFO*)&ver);
2280
2281 s_isAtLeastWin2kSP4 =
2282 ((ver.dwMajorVersion > 5) || // Vista+
2283 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2284 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2285 ver.wServicePackMajor >= 4)) // 2000 SP4+
2286 ? 1 : 0;
2287 }
2288
2289 return s_isAtLeastWin2kSP4 == 1;
2290 #endif
2291 }
2292
2293
2294 // the code page we're working with
2295 long m_CodePage;
2296
2297 // cached result of GetMBNulLen(), set to 0 initially meaning
2298 // "unknown"
2299 size_t m_minMBCharWidth;
2300 };
2301
2302 #endif // wxHAVE_WIN32_MB2WC
2303
2304 // ============================================================================
2305 // CoreFoundation conversion classes
2306 // ============================================================================
2307
2308 #ifdef __DARWIN__
2309
2310 // RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2311 // Strangely enough, Core Foundation uses UTF-32 internally
2312 // quite a bit - it's just not public (yet).
2313
2314 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2315 {
2316 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2317
2318 switch (encoding)
2319 {
2320 case wxFONTENCODING_DEFAULT :
2321 enc = CFStringGetSystemEncoding();
2322 break ;
2323
2324 case wxFONTENCODING_ISO8859_1 :
2325 enc = kCFStringEncodingISOLatin1 ;
2326 break ;
2327 case wxFONTENCODING_ISO8859_2 :
2328 enc = kCFStringEncodingISOLatin2;
2329 break ;
2330 case wxFONTENCODING_ISO8859_3 :
2331 enc = kCFStringEncodingISOLatin3 ;
2332 break ;
2333 case wxFONTENCODING_ISO8859_4 :
2334 enc = kCFStringEncodingISOLatin4;
2335 break ;
2336 case wxFONTENCODING_ISO8859_5 :
2337 enc = kCFStringEncodingISOLatinCyrillic;
2338 break ;
2339 case wxFONTENCODING_ISO8859_6 :
2340 enc = kCFStringEncodingISOLatinArabic;
2341 break ;
2342 case wxFONTENCODING_ISO8859_7 :
2343 enc = kCFStringEncodingISOLatinGreek;
2344 break ;
2345 case wxFONTENCODING_ISO8859_8 :
2346 enc = kCFStringEncodingISOLatinHebrew;
2347 break ;
2348 case wxFONTENCODING_ISO8859_9 :
2349 enc = kCFStringEncodingISOLatin5;
2350 break ;
2351 case wxFONTENCODING_ISO8859_10 :
2352 enc = kCFStringEncodingISOLatin6;
2353 break ;
2354 case wxFONTENCODING_ISO8859_11 :
2355 enc = kCFStringEncodingISOLatinThai;
2356 break ;
2357 case wxFONTENCODING_ISO8859_13 :
2358 enc = kCFStringEncodingISOLatin7;
2359 break ;
2360 case wxFONTENCODING_ISO8859_14 :
2361 enc = kCFStringEncodingISOLatin8;
2362 break ;
2363 case wxFONTENCODING_ISO8859_15 :
2364 enc = kCFStringEncodingISOLatin9;
2365 break ;
2366
2367 case wxFONTENCODING_KOI8 :
2368 enc = kCFStringEncodingKOI8_R;
2369 break ;
2370 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2371 enc = kCFStringEncodingDOSRussian;
2372 break ;
2373
2374 // case wxFONTENCODING_BULGARIAN :
2375 // enc = ;
2376 // break ;
2377
2378 case wxFONTENCODING_CP437 :
2379 enc = kCFStringEncodingDOSLatinUS ;
2380 break ;
2381 case wxFONTENCODING_CP850 :
2382 enc = kCFStringEncodingDOSLatin1;
2383 break ;
2384 case wxFONTENCODING_CP852 :
2385 enc = kCFStringEncodingDOSLatin2;
2386 break ;
2387 case wxFONTENCODING_CP855 :
2388 enc = kCFStringEncodingDOSCyrillic;
2389 break ;
2390 case wxFONTENCODING_CP866 :
2391 enc = kCFStringEncodingDOSRussian ;
2392 break ;
2393 case wxFONTENCODING_CP874 :
2394 enc = kCFStringEncodingDOSThai;
2395 break ;
2396 case wxFONTENCODING_CP932 :
2397 enc = kCFStringEncodingDOSJapanese;
2398 break ;
2399 case wxFONTENCODING_CP936 :
2400 enc = kCFStringEncodingDOSChineseSimplif ;
2401 break ;
2402 case wxFONTENCODING_CP949 :
2403 enc = kCFStringEncodingDOSKorean;
2404 break ;
2405 case wxFONTENCODING_CP950 :
2406 enc = kCFStringEncodingDOSChineseTrad;
2407 break ;
2408 case wxFONTENCODING_CP1250 :
2409 enc = kCFStringEncodingWindowsLatin2;
2410 break ;
2411 case wxFONTENCODING_CP1251 :
2412 enc = kCFStringEncodingWindowsCyrillic ;
2413 break ;
2414 case wxFONTENCODING_CP1252 :
2415 enc = kCFStringEncodingWindowsLatin1 ;
2416 break ;
2417 case wxFONTENCODING_CP1253 :
2418 enc = kCFStringEncodingWindowsGreek;
2419 break ;
2420 case wxFONTENCODING_CP1254 :
2421 enc = kCFStringEncodingWindowsLatin5;
2422 break ;
2423 case wxFONTENCODING_CP1255 :
2424 enc = kCFStringEncodingWindowsHebrew ;
2425 break ;
2426 case wxFONTENCODING_CP1256 :
2427 enc = kCFStringEncodingWindowsArabic ;
2428 break ;
2429 case wxFONTENCODING_CP1257 :
2430 enc = kCFStringEncodingWindowsBalticRim;
2431 break ;
2432 // This only really encodes to UTF7 (if that) evidently
2433 // case wxFONTENCODING_UTF7 :
2434 // enc = kCFStringEncodingNonLossyASCII ;
2435 // break ;
2436 case wxFONTENCODING_UTF8 :
2437 enc = kCFStringEncodingUTF8 ;
2438 break ;
2439 case wxFONTENCODING_EUC_JP :
2440 enc = kCFStringEncodingEUC_JP;
2441 break ;
2442 case wxFONTENCODING_UTF16 :
2443 enc = kCFStringEncodingUnicode ;
2444 break ;
2445 case wxFONTENCODING_MACROMAN :
2446 enc = kCFStringEncodingMacRoman ;
2447 break ;
2448 case wxFONTENCODING_MACJAPANESE :
2449 enc = kCFStringEncodingMacJapanese ;
2450 break ;
2451 case wxFONTENCODING_MACCHINESETRAD :
2452 enc = kCFStringEncodingMacChineseTrad ;
2453 break ;
2454 case wxFONTENCODING_MACKOREAN :
2455 enc = kCFStringEncodingMacKorean ;
2456 break ;
2457 case wxFONTENCODING_MACARABIC :
2458 enc = kCFStringEncodingMacArabic ;
2459 break ;
2460 case wxFONTENCODING_MACHEBREW :
2461 enc = kCFStringEncodingMacHebrew ;
2462 break ;
2463 case wxFONTENCODING_MACGREEK :
2464 enc = kCFStringEncodingMacGreek ;
2465 break ;
2466 case wxFONTENCODING_MACCYRILLIC :
2467 enc = kCFStringEncodingMacCyrillic ;
2468 break ;
2469 case wxFONTENCODING_MACDEVANAGARI :
2470 enc = kCFStringEncodingMacDevanagari ;
2471 break ;
2472 case wxFONTENCODING_MACGURMUKHI :
2473 enc = kCFStringEncodingMacGurmukhi ;
2474 break ;
2475 case wxFONTENCODING_MACGUJARATI :
2476 enc = kCFStringEncodingMacGujarati ;
2477 break ;
2478 case wxFONTENCODING_MACORIYA :
2479 enc = kCFStringEncodingMacOriya ;
2480 break ;
2481 case wxFONTENCODING_MACBENGALI :
2482 enc = kCFStringEncodingMacBengali ;
2483 break ;
2484 case wxFONTENCODING_MACTAMIL :
2485 enc = kCFStringEncodingMacTamil ;
2486 break ;
2487 case wxFONTENCODING_MACTELUGU :
2488 enc = kCFStringEncodingMacTelugu ;
2489 break ;
2490 case wxFONTENCODING_MACKANNADA :
2491 enc = kCFStringEncodingMacKannada ;
2492 break ;
2493 case wxFONTENCODING_MACMALAJALAM :
2494 enc = kCFStringEncodingMacMalayalam ;
2495 break ;
2496 case wxFONTENCODING_MACSINHALESE :
2497 enc = kCFStringEncodingMacSinhalese ;
2498 break ;
2499 case wxFONTENCODING_MACBURMESE :
2500 enc = kCFStringEncodingMacBurmese ;
2501 break ;
2502 case wxFONTENCODING_MACKHMER :
2503 enc = kCFStringEncodingMacKhmer ;
2504 break ;
2505 case wxFONTENCODING_MACTHAI :
2506 enc = kCFStringEncodingMacThai ;
2507 break ;
2508 case wxFONTENCODING_MACLAOTIAN :
2509 enc = kCFStringEncodingMacLaotian ;
2510 break ;
2511 case wxFONTENCODING_MACGEORGIAN :
2512 enc = kCFStringEncodingMacGeorgian ;
2513 break ;
2514 case wxFONTENCODING_MACARMENIAN :
2515 enc = kCFStringEncodingMacArmenian ;
2516 break ;
2517 case wxFONTENCODING_MACCHINESESIMP :
2518 enc = kCFStringEncodingMacChineseSimp ;
2519 break ;
2520 case wxFONTENCODING_MACTIBETAN :
2521 enc = kCFStringEncodingMacTibetan ;
2522 break ;
2523 case wxFONTENCODING_MACMONGOLIAN :
2524 enc = kCFStringEncodingMacMongolian ;
2525 break ;
2526 case wxFONTENCODING_MACETHIOPIC :
2527 enc = kCFStringEncodingMacEthiopic ;
2528 break ;
2529 case wxFONTENCODING_MACCENTRALEUR :
2530 enc = kCFStringEncodingMacCentralEurRoman ;
2531 break ;
2532 case wxFONTENCODING_MACVIATNAMESE :
2533 enc = kCFStringEncodingMacVietnamese ;
2534 break ;
2535 case wxFONTENCODING_MACARABICEXT :
2536 enc = kCFStringEncodingMacExtArabic ;
2537 break ;
2538 case wxFONTENCODING_MACSYMBOL :
2539 enc = kCFStringEncodingMacSymbol ;
2540 break ;
2541 case wxFONTENCODING_MACDINGBATS :
2542 enc = kCFStringEncodingMacDingbats ;
2543 break ;
2544 case wxFONTENCODING_MACTURKISH :
2545 enc = kCFStringEncodingMacTurkish ;
2546 break ;
2547 case wxFONTENCODING_MACCROATIAN :
2548 enc = kCFStringEncodingMacCroatian ;
2549 break ;
2550 case wxFONTENCODING_MACICELANDIC :
2551 enc = kCFStringEncodingMacIcelandic ;
2552 break ;
2553 case wxFONTENCODING_MACROMANIAN :
2554 enc = kCFStringEncodingMacRomanian ;
2555 break ;
2556 case wxFONTENCODING_MACCELTIC :
2557 enc = kCFStringEncodingMacCeltic ;
2558 break ;
2559 case wxFONTENCODING_MACGAELIC :
2560 enc = kCFStringEncodingMacGaelic ;
2561 break ;
2562 // case wxFONTENCODING_MACKEYBOARD :
2563 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2564 // break ;
2565
2566 default :
2567 // because gcc is picky
2568 break ;
2569 }
2570
2571 return enc ;
2572 }
2573
2574 class wxMBConv_cf : public wxMBConv
2575 {
2576 public:
2577 wxMBConv_cf()
2578 {
2579 Init(CFStringGetSystemEncoding()) ;
2580 }
2581
2582 wxMBConv_cf(const wxMBConv_cf& conv)
2583 {
2584 m_encoding = conv.m_encoding;
2585 }
2586
2587 #if wxUSE_FONTMAP
2588 wxMBConv_cf(const char* name)
2589 {
2590 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2591 }
2592 #endif
2593
2594 wxMBConv_cf(wxFontEncoding encoding)
2595 {
2596 Init( wxCFStringEncFromFontEnc(encoding) );
2597 }
2598
2599 virtual ~wxMBConv_cf()
2600 {
2601 }
2602
2603 void Init( CFStringEncoding encoding)
2604 {
2605 m_encoding = encoding ;
2606 }
2607
2608 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2609 {
2610 wxASSERT(szUnConv);
2611
2612 CFStringRef theString = CFStringCreateWithBytes (
2613 NULL, //the allocator
2614 (const UInt8*)szUnConv,
2615 strlen(szUnConv),
2616 m_encoding,
2617 false //no BOM/external representation
2618 );
2619
2620 wxASSERT(theString);
2621
2622 size_t nOutLength = CFStringGetLength(theString);
2623
2624 if (szOut == NULL)
2625 {
2626 CFRelease(theString);
2627 return nOutLength;
2628 }
2629
2630 CFRange theRange = { 0, nOutSize };
2631
2632 #if SIZEOF_WCHAR_T == 4
2633 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2634 #endif
2635
2636 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2637
2638 CFRelease(theString);
2639
2640 szUniCharBuffer[nOutLength] = '\0';
2641
2642 #if SIZEOF_WCHAR_T == 4
2643 wxMBConvUTF16 converter;
2644 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2645 delete [] szUniCharBuffer;
2646 #endif
2647
2648 return nOutLength;
2649 }
2650
2651 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2652 {
2653 wxASSERT(szUnConv);
2654
2655 size_t nRealOutSize;
2656 size_t nBufSize = wxWcslen(szUnConv);
2657 UniChar* szUniBuffer = (UniChar*) szUnConv;
2658
2659 #if SIZEOF_WCHAR_T == 4
2660 wxMBConvUTF16 converter ;
2661 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2662 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2663 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2664 nBufSize /= sizeof(UniChar);
2665 #endif
2666
2667 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2668 NULL, //allocator
2669 szUniBuffer,
2670 nBufSize,
2671 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2672 );
2673
2674 wxASSERT(theString);
2675
2676 //Note that CER puts a BOM when converting to unicode
2677 //so we check and use getchars instead in that case
2678 if (m_encoding == kCFStringEncodingUnicode)
2679 {
2680 if (szOut != NULL)
2681 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2682
2683 nRealOutSize = CFStringGetLength(theString) + 1;
2684 }
2685 else
2686 {
2687 CFStringGetBytes(
2688 theString,
2689 CFRangeMake(0, CFStringGetLength(theString)),
2690 m_encoding,
2691 0, //what to put in characters that can't be converted -
2692 //0 tells CFString to return NULL if it meets such a character
2693 false, //not an external representation
2694 (UInt8*) szOut,
2695 nOutSize,
2696 (CFIndex*) &nRealOutSize
2697 );
2698 }
2699
2700 CFRelease(theString);
2701
2702 #if SIZEOF_WCHAR_T == 4
2703 delete[] szUniBuffer;
2704 #endif
2705
2706 return nRealOutSize - 1;
2707 }
2708
2709 virtual wxMBConv *Clone() const { return new wxMBConv_cf(*this); }
2710
2711 bool IsOk() const
2712 {
2713 return m_encoding != kCFStringEncodingInvalidId &&
2714 CFStringIsEncodingAvailable(m_encoding);
2715 }
2716
2717 private:
2718 CFStringEncoding m_encoding ;
2719 };
2720
2721 #endif // __DARWIN__
2722
2723 // ============================================================================
2724 // Mac conversion classes
2725 // ============================================================================
2726
2727 /* Although we are in the base library we currently have this wxMac
2728 * conditional. This is not generally good but fortunately does not affect
2729 * the ABI of the base library, only what encodings might work.
2730 * It does mean that a wxBase built as part of wxMac has slightly more support
2731 * than one built for wxCocoa or even wxGtk.
2732 */
2733 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2734
2735 class wxMBConv_mac : public wxMBConv
2736 {
2737 public:
2738 wxMBConv_mac()
2739 {
2740 Init(CFStringGetSystemEncoding()) ;
2741 }
2742
2743 wxMBConv_mac(const wxMBConv_mac& conv)
2744 {
2745 Init(conv.m_char_encoding);
2746 }
2747
2748 #if wxUSE_FONTMAP
2749 wxMBConv_mac(const char* name)
2750 {
2751 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2752 }
2753 #endif
2754
2755 wxMBConv_mac(wxFontEncoding encoding)
2756 {
2757 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2758 }
2759
2760 virtual ~wxMBConv_mac()
2761 {
2762 OSStatus status = noErr ;
2763 if (m_MB2WC_converter)
2764 status = TECDisposeConverter(m_MB2WC_converter);
2765 if (m_WC2MB_converter)
2766 status = TECDisposeConverter(m_WC2MB_converter);
2767 }
2768
2769 void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2770 TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2771 {
2772 m_MB2WC_converter = NULL ;
2773 m_WC2MB_converter = NULL ;
2774 m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
2775 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2776 }
2777
2778 virtual void CreateIfNeeded() const
2779 {
2780 if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
2781 {
2782 OSStatus status = noErr ;
2783 status = TECCreateConverter(&m_MB2WC_converter,
2784 m_char_encoding,
2785 m_unicode_encoding);
2786 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2787 status = TECCreateConverter(&m_WC2MB_converter,
2788 m_unicode_encoding,
2789 m_char_encoding);
2790 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2791 }
2792 }
2793
2794 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2795 {
2796 CreateIfNeeded() ;
2797 OSStatus status = noErr ;
2798 ByteCount byteOutLen ;
2799 ByteCount byteInLen = strlen(psz) + 1;
2800 wchar_t *tbuf = NULL ;
2801 UniChar* ubuf = NULL ;
2802 size_t res = 0 ;
2803
2804 if (buf == NULL)
2805 {
2806 // Apple specs say at least 32
2807 n = wxMax( 32, byteInLen ) ;
2808 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2809 }
2810
2811 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2812
2813 #if SIZEOF_WCHAR_T == 4
2814 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2815 #else
2816 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2817 #endif
2818
2819 status = TECConvertText(
2820 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2821 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2822
2823 #if SIZEOF_WCHAR_T == 4
2824 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2825 // is not properly terminated we get random characters at the end
2826 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2827 wxMBConvUTF16 converter ;
2828 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2829 free( ubuf ) ;
2830 #else
2831 res = byteOutLen / sizeof( UniChar ) ;
2832 #endif
2833
2834 if ( buf == NULL )
2835 free(tbuf) ;
2836
2837 if ( buf && res < n)
2838 buf[res] = 0;
2839
2840 return res ;
2841 }
2842
2843 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2844 {
2845 CreateIfNeeded() ;
2846 OSStatus status = noErr ;
2847 ByteCount byteOutLen ;
2848 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2849
2850 char *tbuf = NULL ;
2851
2852 if (buf == NULL)
2853 {
2854 // Apple specs say at least 32
2855 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2856 tbuf = (char*) malloc( n ) ;
2857 }
2858
2859 ByteCount byteBufferLen = n ;
2860 UniChar* ubuf = NULL ;
2861
2862 #if SIZEOF_WCHAR_T == 4
2863 wxMBConvUTF16 converter ;
2864 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2865 byteInLen = unicharlen ;
2866 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2867 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2868 #else
2869 ubuf = (UniChar*) psz ;
2870 #endif
2871
2872 status = TECConvertText(
2873 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2874 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2875
2876 #if SIZEOF_WCHAR_T == 4
2877 free( ubuf ) ;
2878 #endif
2879
2880 if ( buf == NULL )
2881 free(tbuf) ;
2882
2883 size_t res = byteOutLen ;
2884 if ( buf && res < n)
2885 {
2886 buf[res] = 0;
2887
2888 //we need to double-trip to verify it didn't insert any ? in place
2889 //of bogus characters
2890 wxWCharBuffer wcBuf(n);
2891 size_t pszlen = wxWcslen(psz);
2892 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2893 wxWcslen(wcBuf) != pszlen ||
2894 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2895 {
2896 // we didn't obtain the same thing we started from, hence
2897 // the conversion was lossy and we consider that it failed
2898 return wxCONV_FAILED;
2899 }
2900 }
2901
2902 return res ;
2903 }
2904
2905 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2906
2907 bool IsOk() const
2908 {
2909 CreateIfNeeded() ;
2910 return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
2911 }
2912
2913 protected :
2914 mutable TECObjectRef m_MB2WC_converter;
2915 mutable TECObjectRef m_WC2MB_converter;
2916
2917 TextEncodingBase m_char_encoding;
2918 TextEncodingBase m_unicode_encoding;
2919 };
2920
2921 // MB is decomposed (D) normalized UTF8
2922
2923 class wxMBConv_macUTF8D : public wxMBConv_mac
2924 {
2925 public :
2926 wxMBConv_macUTF8D()
2927 {
2928 Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
2929 m_uni = NULL;
2930 m_uniBack = NULL ;
2931 }
2932
2933 virtual ~wxMBConv_macUTF8D()
2934 {
2935 if (m_uni!=NULL)
2936 DisposeUnicodeToTextInfo(&m_uni);
2937 if (m_uniBack!=NULL)
2938 DisposeUnicodeToTextInfo(&m_uniBack);
2939 }
2940
2941 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2942 {
2943 CreateIfNeeded() ;
2944 OSStatus status = noErr ;
2945 ByteCount byteOutLen ;
2946 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2947
2948 char *tbuf = NULL ;
2949
2950 if (buf == NULL)
2951 {
2952 // Apple specs say at least 32
2953 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2954 tbuf = (char*) malloc( n ) ;
2955 }
2956
2957 ByteCount byteBufferLen = n ;
2958 UniChar* ubuf = NULL ;
2959
2960 #if SIZEOF_WCHAR_T == 4
2961 wxMBConvUTF16 converter ;
2962 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2963 byteInLen = unicharlen ;
2964 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2965 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2966 #else
2967 ubuf = (UniChar*) psz ;
2968 #endif
2969
2970 // ubuf is a non-decomposed UniChar buffer
2971
2972 ByteCount dcubuflen = byteInLen * 2 + 2 ;
2973 ByteCount dcubufread , dcubufwritten ;
2974 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2975
2976 ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
2977 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , dcubuf ) ;
2978
2979 // we now convert that decomposed buffer into UTF8
2980
2981 status = TECConvertText(
2982 m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
2983 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2984
2985 free( dcubuf );
2986
2987 #if SIZEOF_WCHAR_T == 4
2988 free( ubuf ) ;
2989 #endif
2990
2991 if ( buf == NULL )
2992 free(tbuf) ;
2993
2994 size_t res = byteOutLen ;
2995 if ( buf && res < n)
2996 {
2997 buf[res] = 0;
2998 // don't test for round-trip fidelity yet, we cannot guarantee it yet
2999 }
3000
3001 return res ;
3002 }
3003
3004 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
3005 {
3006 CreateIfNeeded() ;
3007 OSStatus status = noErr ;
3008 ByteCount byteOutLen ;
3009 ByteCount byteInLen = strlen(psz) + 1;
3010 wchar_t *tbuf = NULL ;
3011 UniChar* ubuf = NULL ;
3012 size_t res = 0 ;
3013
3014 if (buf == NULL)
3015 {
3016 // Apple specs say at least 32
3017 n = wxMax( 32, byteInLen ) ;
3018 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
3019 }
3020
3021 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
3022
3023 #if SIZEOF_WCHAR_T == 4
3024 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
3025 #else
3026 ubuf = (UniChar*) (buf ? buf : tbuf) ;
3027 #endif
3028
3029 ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
3030 ByteCount dcubufread , dcubufwritten ;
3031 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
3032
3033 status = TECConvertText(
3034 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
3035 (TextPtr) dcubuf, dcubuflen, &byteOutLen);
3036 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
3037 // is not properly terminated we get random characters at the end
3038 dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3039
3040 // now from the decomposed UniChar to properly composed uniChar
3041 ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
3042 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , ubuf ) ;
3043
3044 free( dcubuf );
3045 byteOutLen = dcubufwritten ;
3046 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3047
3048
3049 #if SIZEOF_WCHAR_T == 4
3050 wxMBConvUTF16 converter ;
3051 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
3052 free( ubuf ) ;
3053 #else
3054 res = byteOutLen / sizeof( UniChar ) ;
3055 #endif
3056
3057 if ( buf == NULL )
3058 free(tbuf) ;
3059
3060 if ( buf && res < n)
3061 buf[res] = 0;
3062
3063 return res ;
3064 }
3065
3066 virtual void CreateIfNeeded() const
3067 {
3068 wxMBConv_mac::CreateIfNeeded() ;
3069 if ( m_uni == NULL )
3070 {
3071 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3072 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3073 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3074 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
3075 m_map.mappingVersion = kUnicodeUseLatestMapping;
3076
3077 OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
3078 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3079
3080 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3081 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3082 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3083 kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
3084 m_map.mappingVersion = kUnicodeUseLatestMapping;
3085 err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
3086 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3087 }
3088 }
3089 protected :
3090 mutable UnicodeToTextInfo m_uni;
3091 mutable UnicodeToTextInfo m_uniBack;
3092 mutable UnicodeMapping m_map;
3093 };
3094 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3095
3096 // ============================================================================
3097 // wxEncodingConverter based conversion classes
3098 // ============================================================================
3099
3100 #if wxUSE_FONTMAP
3101
3102 class wxMBConv_wxwin : public wxMBConv
3103 {
3104 private:
3105 void Init()
3106 {
3107 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
3108 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
3109 }
3110
3111 public:
3112 // temporarily just use wxEncodingConverter stuff,
3113 // so that it works while a better implementation is built
3114 wxMBConv_wxwin(const char* name)
3115 {
3116 if (name)
3117 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3118 else
3119 m_enc = wxFONTENCODING_SYSTEM;
3120
3121 Init();
3122 }
3123
3124 wxMBConv_wxwin(wxFontEncoding enc)
3125 {
3126 m_enc = enc;
3127
3128 Init();
3129 }
3130
3131 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
3132 {
3133 size_t inbuf = strlen(psz);
3134 if (buf)
3135 {
3136 if (!m2w.Convert(psz, buf))
3137 return wxCONV_FAILED;
3138 }
3139 return inbuf;
3140 }
3141
3142 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
3143 {
3144 const size_t inbuf = wxWcslen(psz);
3145 if (buf)
3146 {
3147 if (!w2m.Convert(psz, buf))
3148 return wxCONV_FAILED;
3149 }
3150
3151 return inbuf;
3152 }
3153
3154 virtual size_t GetMBNulLen() const
3155 {
3156 switch ( m_enc )
3157 {
3158 case wxFONTENCODING_UTF16BE:
3159 case wxFONTENCODING_UTF16LE:
3160 return 2;
3161
3162 case wxFONTENCODING_UTF32BE:
3163 case wxFONTENCODING_UTF32LE:
3164 return 4;
3165
3166 default:
3167 return 1;
3168 }
3169 }
3170
3171 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
3172
3173 bool IsOk() const { return m_ok; }
3174
3175 public:
3176 wxFontEncoding m_enc;
3177 wxEncodingConverter m2w, w2m;
3178
3179 private:
3180 // were we initialized successfully?
3181 bool m_ok;
3182
3183 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
3184 };
3185
3186 // make the constructors available for unit testing
3187 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
3188 {
3189 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
3190 if ( !result->IsOk() )
3191 {
3192 delete result;
3193 return 0;
3194 }
3195
3196 return result;
3197 }
3198
3199 #endif // wxUSE_FONTMAP
3200
3201 // ============================================================================
3202 // wxCSConv implementation
3203 // ============================================================================
3204
3205 void wxCSConv::Init()
3206 {
3207 m_name = NULL;
3208 m_convReal = NULL;
3209 m_deferred = true;
3210 }
3211
3212 wxCSConv::wxCSConv(const wxString& charset)
3213 {
3214 Init();
3215
3216 if ( !charset.empty() )
3217 {
3218 SetName(charset.ToAscii());
3219 }
3220
3221 #if wxUSE_FONTMAP
3222 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3223 #else
3224 m_encoding = wxFONTENCODING_SYSTEM;
3225 #endif
3226 }
3227
3228 wxCSConv::wxCSConv(wxFontEncoding encoding)
3229 {
3230 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3231 {
3232 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3233
3234 encoding = wxFONTENCODING_SYSTEM;
3235 }
3236
3237 Init();
3238
3239 m_encoding = encoding;
3240 }
3241
3242 wxCSConv::~wxCSConv()
3243 {
3244 Clear();
3245 }
3246
3247 wxCSConv::wxCSConv(const wxCSConv& conv)
3248 : wxMBConv()
3249 {
3250 Init();
3251
3252 SetName(conv.m_name);
3253 m_encoding = conv.m_encoding;
3254 }
3255
3256 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3257 {
3258 Clear();
3259
3260 SetName(conv.m_name);
3261 m_encoding = conv.m_encoding;
3262
3263 return *this;
3264 }
3265
3266 void wxCSConv::Clear()
3267 {
3268 free(m_name);
3269 delete m_convReal;
3270
3271 m_name = NULL;
3272 m_convReal = NULL;
3273 }
3274
3275 void wxCSConv::SetName(const char *charset)
3276 {
3277 if (charset)
3278 {
3279 m_name = strdup(charset);
3280 m_deferred = true;
3281 }
3282 }
3283
3284 #if wxUSE_FONTMAP
3285
3286 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3287 wxEncodingNameCache );
3288
3289 static wxEncodingNameCache gs_nameCache;
3290 #endif
3291
3292 wxMBConv *wxCSConv::DoCreate() const
3293 {
3294 #if wxUSE_FONTMAP
3295 wxLogTrace(TRACE_STRCONV,
3296 wxT("creating conversion for %s"),
3297 (m_name ? m_name
3298 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3299 #endif // wxUSE_FONTMAP
3300
3301 // check for the special case of ASCII or ISO8859-1 charset: as we have
3302 // special knowledge of it anyhow, we don't need to create a special
3303 // conversion object
3304 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3305 m_encoding == wxFONTENCODING_DEFAULT )
3306 {
3307 // don't convert at all
3308 return NULL;
3309 }
3310
3311 // we trust OS to do conversion better than we can so try external
3312 // conversion methods first
3313 //
3314 // the full order is:
3315 // 1. OS conversion (iconv() under Unix or Win32 API)
3316 // 2. hard coded conversions for UTF
3317 // 3. wxEncodingConverter as fall back
3318
3319 // step (1)
3320 #ifdef HAVE_ICONV
3321 #if !wxUSE_FONTMAP
3322 if ( m_name )
3323 #endif // !wxUSE_FONTMAP
3324 {
3325 #if wxUSE_FONTMAP
3326 wxFontEncoding encoding(m_encoding);
3327 #endif
3328
3329 if ( m_name )
3330 {
3331 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3332 if ( conv->IsOk() )
3333 return conv;
3334
3335 delete conv;
3336
3337 #if wxUSE_FONTMAP
3338 encoding =
3339 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3340 #endif // wxUSE_FONTMAP
3341 }
3342 #if wxUSE_FONTMAP
3343 {
3344 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3345 if ( it != gs_nameCache.end() )
3346 {
3347 if ( it->second.empty() )
3348 return NULL;
3349
3350 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3351 if ( conv->IsOk() )
3352 return conv;
3353
3354 delete conv;
3355 }
3356
3357 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3358 // CS : in case this does not return valid names (eg for MacRoman)
3359 // encoding got a 'failure' entry in the cache all the same,
3360 // although it just has to be created using a different method, so
3361 // only store failed iconv creation attempts (or perhaps we
3362 // shoulnd't do this at all ?)
3363 if ( names[0] != NULL )
3364 {
3365 for ( ; *names; ++names )
3366 {
3367 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3368 // will need changes that will obsolete this
3369 wxString name(*names);
3370 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3371 if ( conv->IsOk() )
3372 {
3373 gs_nameCache[encoding] = *names;
3374 return conv;
3375 }
3376
3377 delete conv;
3378 }
3379
3380 gs_nameCache[encoding] = _T(""); // cache the failure
3381 }
3382 }
3383 #endif // wxUSE_FONTMAP
3384 }
3385 #endif // HAVE_ICONV
3386
3387 #ifdef wxHAVE_WIN32_MB2WC
3388 {
3389 #if wxUSE_FONTMAP
3390 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3391 : new wxMBConv_win32(m_encoding);
3392 if ( conv->IsOk() )
3393 return conv;
3394
3395 delete conv;
3396 #else
3397 return NULL;
3398 #endif
3399 }
3400 #endif // wxHAVE_WIN32_MB2WC
3401
3402 #if defined(__WXMAC__)
3403 {
3404 // leave UTF16 and UTF32 to the built-ins of wx
3405 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3406 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3407 {
3408 #if wxUSE_FONTMAP
3409 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3410 : new wxMBConv_mac(m_encoding);
3411 #else
3412 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3413 #endif
3414 if ( conv->IsOk() )
3415 return conv;
3416
3417 delete conv;
3418 }
3419 }
3420 #endif
3421
3422 #ifdef __DARWIN__
3423 {
3424 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3425 {
3426 #if wxUSE_FONTMAP
3427 wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3428 : new wxMBConv_cf(m_encoding);
3429 #else
3430 wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3431 #endif
3432
3433 if ( conv->IsOk() )
3434 return conv;
3435
3436 delete conv;
3437 }
3438 }
3439 #endif // __DARWIN__
3440
3441 // step (2)
3442 wxFontEncoding enc = m_encoding;
3443 #if wxUSE_FONTMAP
3444 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3445 {
3446 // use "false" to suppress interactive dialogs -- we can be called from
3447 // anywhere and popping up a dialog from here is the last thing we want to
3448 // do
3449 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3450 }
3451 #endif // wxUSE_FONTMAP
3452
3453 switch ( enc )
3454 {
3455 case wxFONTENCODING_UTF7:
3456 return new wxMBConvUTF7;
3457
3458 case wxFONTENCODING_UTF8:
3459 return new wxMBConvUTF8;
3460
3461 case wxFONTENCODING_UTF16BE:
3462 return new wxMBConvUTF16BE;
3463
3464 case wxFONTENCODING_UTF16LE:
3465 return new wxMBConvUTF16LE;
3466
3467 case wxFONTENCODING_UTF32BE:
3468 return new wxMBConvUTF32BE;
3469
3470 case wxFONTENCODING_UTF32LE:
3471 return new wxMBConvUTF32LE;
3472
3473 default:
3474 // nothing to do but put here to suppress gcc warnings
3475 break;
3476 }
3477
3478 // step (3)
3479 #if wxUSE_FONTMAP
3480 {
3481 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3482 : new wxMBConv_wxwin(m_encoding);
3483 if ( conv->IsOk() )
3484 return conv;
3485
3486 delete conv;
3487 }
3488 #endif // wxUSE_FONTMAP
3489
3490 // NB: This is a hack to prevent deadlock. What could otherwise happen
3491 // in Unicode build: wxConvLocal creation ends up being here
3492 // because of some failure and logs the error. But wxLog will try to
3493 // attach a timestamp, for which it will need wxConvLocal (to convert
3494 // time to char* and then wchar_t*), but that fails, tries to log the
3495 // error, but wxLog has an (already locked) critical section that
3496 // guards the static buffer.
3497 static bool alreadyLoggingError = false;
3498 if (!alreadyLoggingError)
3499 {
3500 alreadyLoggingError = true;
3501 wxLogError(_("Cannot convert from the charset '%s'!"),
3502 m_name ? m_name
3503 :
3504 #if wxUSE_FONTMAP
3505 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3506 #else // !wxUSE_FONTMAP
3507 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3508 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3509 );
3510
3511 alreadyLoggingError = false;
3512 }
3513
3514 return NULL;
3515 }
3516
3517 void wxCSConv::CreateConvIfNeeded() const
3518 {
3519 if ( m_deferred )
3520 {
3521 wxCSConv *self = (wxCSConv *)this; // const_cast
3522
3523 // if we don't have neither the name nor the encoding, use the default
3524 // encoding for this system
3525 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3526 {
3527 #if wxUSE_INTL
3528 self->m_encoding = wxLocale::GetSystemEncoding();
3529 #else
3530 // fallback to some reasonable default:
3531 self->m_encoding = wxFONTENCODING_ISO8859_1;
3532 #endif // wxUSE_INTL
3533 }
3534
3535 self->m_convReal = DoCreate();
3536 self->m_deferred = false;
3537 }
3538 }
3539
3540 bool wxCSConv::IsOk() const
3541 {
3542 CreateConvIfNeeded();
3543
3544 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3545 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3546 return true; // always ok as we do it ourselves
3547
3548 // m_convReal->IsOk() is called at its own creation, so we know it must
3549 // be ok if m_convReal is non-NULL
3550 return m_convReal != NULL;
3551 }
3552
3553 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3554 const char *src, size_t srcLen) const
3555 {
3556 CreateConvIfNeeded();
3557
3558 if (m_convReal)
3559 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3560
3561 // latin-1 (direct)
3562 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3563 }
3564
3565 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3566 const wchar_t *src, size_t srcLen) const
3567 {
3568 CreateConvIfNeeded();
3569
3570 if (m_convReal)
3571 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3572
3573 // latin-1 (direct)
3574 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3575 }
3576
3577 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3578 {
3579 CreateConvIfNeeded();
3580
3581 if (m_convReal)
3582 return m_convReal->MB2WC(buf, psz, n);
3583
3584 // latin-1 (direct)
3585 size_t len = strlen(psz);
3586
3587 if (buf)
3588 {
3589 for (size_t c = 0; c <= len; c++)
3590 buf[c] = (unsigned char)(psz[c]);
3591 }
3592
3593 return len;
3594 }
3595
3596 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3597 {
3598 CreateConvIfNeeded();
3599
3600 if (m_convReal)
3601 return m_convReal->WC2MB(buf, psz, n);
3602
3603 // latin-1 (direct)
3604 const size_t len = wxWcslen(psz);
3605 if (buf)
3606 {
3607 for (size_t c = 0; c <= len; c++)
3608 {
3609 if (psz[c] > 0xFF)
3610 return wxCONV_FAILED;
3611
3612 buf[c] = (char)psz[c];
3613 }
3614 }
3615 else
3616 {
3617 for (size_t c = 0; c <= len; c++)
3618 {
3619 if (psz[c] > 0xFF)
3620 return wxCONV_FAILED;
3621 }
3622 }
3623
3624 return len;
3625 }
3626
3627 size_t wxCSConv::GetMBNulLen() const
3628 {
3629 CreateConvIfNeeded();
3630
3631 if ( m_convReal )
3632 {
3633 return m_convReal->GetMBNulLen();
3634 }
3635
3636 // otherwise, we are ISO-8859-1
3637 return 1;
3638 }
3639
#if wxUSE_UNICODE_UTF8
// Return true if the real converter reports that it is UTF-8.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    // without a real converter we are ISO-8859-1, i.e. not UTF-8
    return m_convReal && m_convReal->IsUTF8();
}
#endif // wxUSE_UNICODE_UTF8
3654
3655
3656 #if wxUSE_UNICODE
3657
3658 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3659 {
3660 if ( !s )
3661 return wxWCharBuffer();
3662
3663 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3664 if ( !wbuf )
3665 wbuf = wxMBConvUTF8().cMB2WX(s);
3666 if ( !wbuf )
3667 wbuf = wxConvISO8859_1.cMB2WX(s);
3668
3669 return wbuf;
3670 }
3671
3672 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3673 {
3674 if ( !ws )
3675 return wxCharBuffer();
3676
3677 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3678 if ( !buf )
3679 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3680
3681 return buf;
3682 }
3683
3684 #endif // wxUSE_UNICODE
3685
3686 // ----------------------------------------------------------------------------
3687 // globals
3688 // ----------------------------------------------------------------------------
3689
3690 // NB: The reason why we create converter objects in this convoluted way,
3691 // using a factory function instead of a global variable, is that they
3692 // may be used at static initialization time (some of them are used by
3693 // wxString ctors and there may be a global wxString object). In other
3694 // words, possibly _before_ the converter global object would be
3695 // initialized.
3696
// Drop any existing macro definitions of the global converter names before
// the corresponding objects are defined below.
// NOTE(review): presumably wx/strconv.h defines these names as macros
// forwarding to the wxGet_xxxPtr() accessors -- confirm against the header.
3697 #undef wxConvLibc
3698 #undef wxConvUTF8
3699 #undef wxConvUTF7
3700 #undef wxConvLocal
3701 #undef wxConvISO8859_1
3702
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines:
//  - an exported pointer name##Ptr of type klass*, initially NULL
//  - an exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static impl_klass object constructed with ctor_args
//  - a file-static gs_##name##instance initialized by calling the accessor
// The expansion does not end with a semicolon, each use supplies its own.
3703 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3704 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3705 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3706 { \
3707 static impl_klass name##Obj ctor_args; \
3708 return &name##Obj; \
3709 } \
3710 /* this ensures that all global converter objects are created */ \
3711 /* by the time static initialization is done, i.e. before any */ \
3712 /* thread is launched: */ \
3713 static klass* gs_##name##instance = wxGet_##name##Ptr()
3714
// Shorthand for the common case in which the object is created as the same
// class that the accessor exposes.
3715 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3716 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3717
// wxConvLibc is exposed as a plain wxMBConv but implemented by a class which
// depends on the platform: wxMBConv_win32 under __WINDOWS__, wxMBConv_mac
// for wxMac builds without __MACH__ and wxMBConvLibc everywhere else.
3718 #ifdef __WINDOWS__
3719 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3720 #elif defined(__WXMAC__) && !defined(__MACH__)
3721 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_mac, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3722 #else
3723 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3724 #endif
3725
// The UTF-8 and UTF-7 converters are created without constructor arguments
// (wxEMPTY_PARAMETER_VALUE is presumably an empty placeholder -- confirm).
3726 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3727 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3728
// Both of these are wxCSConv objects: wxConvLocal is created for
// wxFONTENCODING_SYSTEM and wxConvISO8859_1 for wxFONTENCODING_ISO8859_1.
3729 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3730 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3731
// Initial values of the global converter pointers: wxConvCurrent starts as
// the libc converter and wxConvUI as the wxConvLocal one. Both are obtained
// by calling the accessor functions defined by WX_DEFINE_GLOBAL_CONV* above.
3732 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3733 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3734
// This object only exists in Carbon wxMac builds, where it is used as the
// file name converter below.
3735 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3736 static wxMBConv_macUTF8D wxConvMacUTF8DObj;
3737 #endif
// wxConvFileName is a single definition whose initializer is selected by the
// preprocessor: under __WXOSX__ it is wxConvMacUTF8DObj in Carbon wxMac
// builds and the UTF-8 converter otherwise, on all other platforms it is the
// libc converter.
3738 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3739 #ifdef __WXOSX__
3740 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3741 &wxConvMacUTF8DObj;
3742 #else
3743 wxGet_wxConvUTF8Ptr();
3744 #endif
3745 #else // !__WXOSX__
3746 wxGet_wxConvLibcPtr();
3747 #endif // __WXOSX__/!__WXOSX__
3748
3749 #else // !wxUSE_WCHAR_T
3750
3751 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3752 // stand-ins in absence of wchar_t
3753 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3754 wxConvISO8859_1,
3755 wxConvLocal,
3756 wxConvUTF8;
3757
3758 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T