]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
Fix wxCocoa compilation.
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifdef __BORLANDC__
19 #pragma hdrstop
20 #endif //__BORLANDC__
21
22 #ifndef WX_PRECOMP
23 #include "wx/intl.h"
24 #include "wx/log.h"
25 #include "wx/utils.h"
26 #include "wx/hashmap.h"
27 #endif
28
29 #include "wx/strconv.h"
30
31 #if wxUSE_WCHAR_T
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
45 #endif
46
47 #ifdef __SALFORDC__
48 #include <clib.h>
49 #endif
50
51 #ifdef HAVE_ICONV
52 #include <iconv.h>
53 #include "wx/thread.h"
54 #endif
55
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
58
59 #ifdef __WXMAC__
60 #ifndef __DARWIN__
61 #include <ATSUnicode.h>
62 #include <TextCommon.h>
63 #include <TextEncodingConverter.h>
64 #endif
65
66 // includes Mac headers
67 #include "wx/mac/private.h"
68 #endif
69
70
71 #define TRACE_STRCONV _T("strconv")
72
73 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
74 // be 4 bytes
75 #if SIZEOF_WCHAR_T == 2
76 #define WC_UTF16
77 #endif
78
79
80 // ============================================================================
81 // implementation
82 // ============================================================================
83
// Helper for the conversion functions below: returns true if at least one of
// the n bytes starting at p is not NUL, false if all of them are NUL (which
// is also the case when n is 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
92
93 // ----------------------------------------------------------------------------
94 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
95 // ----------------------------------------------------------------------------
96
97 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
98 {
99 if (input <= 0xffff)
100 {
101 if (output)
102 *output = (wxUint16) input;
103
104 return 1;
105 }
106 else if (input >= 0x110000)
107 {
108 return wxCONV_FAILED;
109 }
110 else
111 {
112 if (output)
113 {
114 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
115 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
116 }
117
118 return 2;
119 }
120 }
121
122 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
123 {
124 if ((*input < 0xd800) || (*input > 0xdfff))
125 {
126 output = *input;
127 return 1;
128 }
129 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
130 {
131 output = *input;
132 return wxCONV_FAILED;
133 }
134 else
135 {
136 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
137 return 2;
138 }
139 }
140
141 #ifdef WC_UTF16
142 typedef wchar_t wxDecodeSurrogate_t;
143 #else // !WC_UTF16
144 typedef wxUint16 wxDecodeSurrogate_t;
145 #endif // WC_UTF16/!WC_UTF16
146
147 // returns the next UTF-32 character from the wchar_t buffer and advances the
148 // pointer to the character after this one
149 //
150 // if an invalid character is found, *pSrc is set to NULL, the caller must
151 // check for this
152 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
153 {
154 wxUint32 out;
155 const size_t
156 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
157 if ( n == wxCONV_FAILED )
158 *pSrc = NULL;
159 else
160 *pSrc += n;
161
162 return out;
163 }
164
165 // ----------------------------------------------------------------------------
166 // wxMBConv
167 // ----------------------------------------------------------------------------
168
169 size_t
170 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
171 const char *src, size_t srcLen) const
172 {
173 // although new conversion classes are supposed to implement this function
174 // directly, the existins ones only implement the old MB2WC() and so, to
175 // avoid to have to rewrite all conversion classes at once, we provide a
176 // default (but not efficient) implementation of this one in terms of the
177 // old function by copying the input to ensure that it's NUL-terminated and
178 // then using MB2WC() to convert it
179
180 // the number of chars [which would be] written to dst [if it were not NULL]
181 size_t dstWritten = 0;
182
183 // the number of NULs terminating this string
184 size_t nulLen = 0; // not really needed, but just to avoid warnings
185
186 // if we were not given the input size we just have to assume that the
187 // string is properly terminated as we have no way of knowing how long it
188 // is anyhow, but if we do have the size check whether there are enough
189 // NULs at the end
190 wxCharBuffer bufTmp;
191 const char *srcEnd;
192 if ( srcLen != wxNO_LEN )
193 {
194 // we need to know how to find the end of this string
195 nulLen = GetMBNulLen();
196 if ( nulLen == wxCONV_FAILED )
197 return wxCONV_FAILED;
198
199 // if there are enough NULs we can avoid the copy
200 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
201 {
202 // make a copy in order to properly NUL-terminate the string
203 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
204 char * const p = bufTmp.data();
205 memcpy(p, src, srcLen);
206 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
207 *s = '\0';
208
209 src = bufTmp;
210 }
211
212 srcEnd = src + srcLen;
213 }
214 else // quit after the first loop iteration
215 {
216 srcEnd = NULL;
217 }
218
219 for ( ;; )
220 {
221 // try to convert the current chunk
222 size_t lenChunk = MB2WC(NULL, src, 0);
223 if ( lenChunk == wxCONV_FAILED )
224 return wxCONV_FAILED;
225
226 lenChunk++; // for the L'\0' at the end of this chunk
227
228 dstWritten += lenChunk;
229
230 if ( lenChunk == 1 )
231 {
232 // nothing left in the input string, conversion succeeded
233 break;
234 }
235
236 if ( dst )
237 {
238 if ( dstWritten > dstLen )
239 return wxCONV_FAILED;
240
241 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
242 return wxCONV_FAILED;
243
244 dst += lenChunk;
245 }
246
247 if ( !srcEnd )
248 {
249 // we convert just one chunk in this case as this is the entire
250 // string anyhow
251 break;
252 }
253
254 // advance the input pointer past the end of this chunk
255 while ( NotAllNULs(src, nulLen) )
256 {
257 // notice that we must skip over multiple bytes here as we suppose
258 // that if NUL takes 2 or 4 bytes, then all the other characters do
259 // too and so if advanced by a single byte we might erroneously
260 // detect sequences of NUL bytes in the middle of the input
261 src += nulLen;
262 }
263
264 src += nulLen; // skipping over its terminator as well
265
266 // note that ">=" (and not just "==") is needed here as the terminator
267 // we skipped just above could be inside or just after the buffer
268 // delimited by inEnd
269 if ( src >= srcEnd )
270 break;
271 }
272
273 return dstWritten;
274 }
275
276 size_t
277 wxMBConv::FromWChar(char *dst, size_t dstLen,
278 const wchar_t *src, size_t srcLen) const
279 {
280 // the number of chars [which would be] written to dst [if it were not NULL]
281 size_t dstWritten = 0;
282
283 // make a copy of the input string unless it is already properly
284 // NUL-terminated
285 //
286 // if we don't know its length we have no choice but to assume that it is,
287 // indeed, properly terminated
288 wxWCharBuffer bufTmp;
289 if ( srcLen == wxNO_LEN )
290 {
291 srcLen = wxWcslen(src) + 1;
292 }
293 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
294 {
295 // make a copy in order to properly NUL-terminate the string
296 bufTmp = wxWCharBuffer(srcLen);
297 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
298 src = bufTmp;
299 }
300
301 const size_t lenNul = GetMBNulLen();
302 for ( const wchar_t * const srcEnd = src + srcLen;
303 src < srcEnd;
304 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
305 {
306 // try to convert the current chunk
307 size_t lenChunk = WC2MB(NULL, src, 0);
308
309 if ( lenChunk == wxCONV_FAILED )
310 return wxCONV_FAILED;
311
312 lenChunk += lenNul;
313 dstWritten += lenChunk;
314
315 if ( dst )
316 {
317 if ( dstWritten > dstLen )
318 return wxCONV_FAILED;
319
320 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
321 return wxCONV_FAILED;
322
323 dst += lenChunk;
324 }
325 }
326
327 return dstWritten;
328 }
329
330 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
331 {
332 size_t rc = ToWChar(outBuff, outLen, inBuff);
333 if ( rc != wxCONV_FAILED )
334 {
335 // ToWChar() returns the buffer length, i.e. including the trailing
336 // NUL, while this method doesn't take it into account
337 rc--;
338 }
339
340 return rc;
341 }
342
343 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
344 {
345 size_t rc = FromWChar(outBuff, outLen, inBuff);
346 if ( rc != wxCONV_FAILED )
347 {
348 rc -= GetMBNulLen();
349 }
350
351 return rc;
352 }
353
354 wxMBConv::~wxMBConv()
355 {
356 // nothing to do here (necessary for Darwin linking probably)
357 }
358
359 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
360 {
361 if ( psz )
362 {
363 // calculate the length of the buffer needed first
364 const size_t nLen = MB2WC(NULL, psz, 0);
365 if ( nLen != wxCONV_FAILED )
366 {
367 // now do the actual conversion
368 wxWCharBuffer buf(nLen /* +1 added implicitly */);
369
370 // +1 for the trailing NULL
371 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
372 return buf;
373 }
374 }
375
376 return wxWCharBuffer();
377 }
378
379 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
380 {
381 if ( pwz )
382 {
383 const size_t nLen = WC2MB(NULL, pwz, 0);
384 if ( nLen != wxCONV_FAILED )
385 {
386 // extra space for trailing NUL(s)
387 static const size_t extraLen = GetMaxMBNulLen();
388
389 wxCharBuffer buf(nLen + extraLen - 1);
390 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
391 return buf;
392 }
393 }
394
395 return wxCharBuffer();
396 }
397
398 const wxWCharBuffer
399 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
400 {
401 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
402 if ( dstLen != wxCONV_FAILED )
403 {
404 wxWCharBuffer wbuf(dstLen - 1);
405 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
406 {
407 if ( outLen )
408 {
409 *outLen = dstLen;
410 if ( wbuf[dstLen - 1] == L'\0' )
411 (*outLen)--;
412 }
413
414 return wbuf;
415 }
416 }
417
418 if ( outLen )
419 *outLen = 0;
420
421 return wxWCharBuffer();
422 }
423
424 const wxCharBuffer
425 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
426 {
427 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
428 if ( dstLen != wxCONV_FAILED )
429 {
430 // special case of empty input: can't allocate 0 size buffer below as
431 // wxCharBuffer insists on NUL-terminating it
432 wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
433 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
434 {
435 if ( outLen )
436 {
437 *outLen = dstLen;
438
439 const size_t nulLen = GetMBNulLen();
440 if ( dstLen >= nulLen &&
441 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
442 {
443 // in this case the output is NUL-terminated and we're not
444 // supposed to count NUL
445 *outLen -= nulLen;
446 }
447 }
448
449 return buf;
450 }
451 }
452
453 if ( outLen )
454 *outLen = 0;
455
456 return wxCharBuffer();
457 }
458
459 // ----------------------------------------------------------------------------
460 // wxMBConvLibc
461 // ----------------------------------------------------------------------------
462
463 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
464 {
465 return wxMB2WC(buf, psz, n);
466 }
467
468 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
469 {
470 return wxWC2MB(buf, psz, n);
471 }
472
473 // ----------------------------------------------------------------------------
474 // wxConvBrokenFileNames
475 // ----------------------------------------------------------------------------
476
477 #ifdef __UNIX__
478
479 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
480 {
481 if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
482 wxStricmp(charset, _T("UTF8")) == 0 )
483 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
484 else
485 m_conv = new wxCSConv(charset);
486 }
487
488 #endif // __UNIX__
489
490 // ----------------------------------------------------------------------------
491 // UTF-7
492 // ----------------------------------------------------------------------------
493
494 // Implementation (C) 2004 Fredrik Roubert
495
//
// BASE64 decoding table: maps a byte to its 6 bit value or to 0xff if the
// byte is not one of the 64 characters of the encoding table (one row per 16
// byte values)
//
static const unsigned char utf7unb64[] =
{
    /* 0x00 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x10 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x20 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    /* 0x30 */ 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x40 */ 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    /* 0x50 */ 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x60 */ 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    /* 0x70 */ 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x80 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0x90 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xa0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xb0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xc0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xd0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xe0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    /* 0xf0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
534
535 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
536 {
537 size_t len = 0;
538
539 while ( *psz && (!buf || (len < n)) )
540 {
541 unsigned char cc = *psz++;
542 if (cc != '+')
543 {
544 // plain ASCII char
545 if (buf)
546 *buf++ = cc;
547 len++;
548 }
549 else if (*psz == '-')
550 {
551 // encoded plus sign
552 if (buf)
553 *buf++ = cc;
554 len++;
555 psz++;
556 }
557 else // start of BASE64 encoded string
558 {
559 bool lsb, ok;
560 unsigned int d, l;
561 for ( ok = lsb = false, d = 0, l = 0;
562 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
563 psz++ )
564 {
565 d <<= 6;
566 d += cc;
567 for (l += 6; l >= 8; lsb = !lsb)
568 {
569 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
570 if (lsb)
571 {
572 if (buf)
573 *buf++ |= c;
574 len ++;
575 }
576 else
577 {
578 if (buf)
579 *buf = (wchar_t)(c << 8);
580 }
581
582 ok = true;
583 }
584 }
585
586 if ( !ok )
587 {
588 // in valid UTF7 we should have valid characters after '+'
589 return wxCONV_FAILED;
590 }
591
592 if (*psz == '-')
593 psz++;
594 }
595 }
596
597 if ( buf && (len < n) )
598 *buf = '\0';
599
600 return len;
601 }
602
//
// BASE64 encoding table: maps a 6 bit value to the character representing it
//
static const unsigned char utf7enb64[] =
{
    // 0..25
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // 26..51
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // 52..61
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // 62, 63
    '+', '/'
};
617
//
// UTF-7 encoding table, indexed by the ASCII code of the character (one row
// per 8 characters):
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x08 */ 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x18 */ 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0,
    /* 0x28 */ 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 */ 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 */ 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 */ 0, 0, 0, 1, 1, 1, 3, 3
};
637
638 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
639 {
640 size_t len = 0;
641
642 while (*psz && ((!buf) || (len < n)))
643 {
644 wchar_t cc = *psz++;
645 if (cc < 0x80 && utf7encode[cc] < 1)
646 {
647 // plain ASCII char
648 if (buf)
649 *buf++ = (char)cc;
650
651 len++;
652 }
653 #ifndef WC_UTF16
654 else if (((wxUint32)cc) > 0xffff)
655 {
656 // no surrogate pair generation (yet?)
657 return wxCONV_FAILED;
658 }
659 #endif
660 else
661 {
662 if (buf)
663 *buf++ = '+';
664
665 len++;
666 if (cc != '+')
667 {
668 // BASE64 encode string
669 unsigned int lsb, d, l;
670 for (d = 0, l = 0; /*nothing*/; psz++)
671 {
672 for (lsb = 0; lsb < 2; lsb ++)
673 {
674 d <<= 8;
675 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
676
677 for (l += 8; l >= 6; )
678 {
679 l -= 6;
680 if (buf)
681 *buf++ = utf7enb64[(d >> l) % 64];
682 len++;
683 }
684 }
685
686 cc = *psz;
687 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
688 break;
689 }
690
691 if (l != 0)
692 {
693 if (buf)
694 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
695
696 len++;
697 }
698 }
699
700 if (buf)
701 *buf++ = '-';
702 len++;
703 }
704 }
705
706 if (buf && (len < n))
707 *buf = 0;
708
709 return len;
710 }
711
712 // ----------------------------------------------------------------------------
713 // UTF-8
714 // ----------------------------------------------------------------------------
715
716 static wxUint32 utf8_max[]=
717 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
718
719 // boundaries of the private use area we use to (temporarily) remap invalid
720 // characters invalid in a UTF-8 encoded string
721 const wxUint32 wxUnicodePUA = 0x100000;
722 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
723
724 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
725 {
726 size_t len = 0;
727
728 while (*psz && ((!buf) || (len < n)))
729 {
730 const char *opsz = psz;
731 bool invalid = false;
732 unsigned char cc = *psz++, fc = cc;
733 unsigned cnt;
734 for (cnt = 0; fc & 0x80; cnt++)
735 fc <<= 1;
736
737 if (!cnt)
738 {
739 // plain ASCII char
740 if (buf)
741 *buf++ = cc;
742 len++;
743
744 // escape the escape character for octal escapes
745 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
746 && cc == '\\' && (!buf || len < n))
747 {
748 if (buf)
749 *buf++ = cc;
750 len++;
751 }
752 }
753 else
754 {
755 cnt--;
756 if (!cnt)
757 {
758 // invalid UTF-8 sequence
759 invalid = true;
760 }
761 else
762 {
763 unsigned ocnt = cnt - 1;
764 wxUint32 res = cc & (0x3f >> cnt);
765 while (cnt--)
766 {
767 cc = *psz;
768 if ((cc & 0xC0) != 0x80)
769 {
770 // invalid UTF-8 sequence
771 invalid = true;
772 break;
773 }
774
775 psz++;
776 res = (res << 6) | (cc & 0x3f);
777 }
778
779 if (invalid || res <= utf8_max[ocnt])
780 {
781 // illegal UTF-8 encoding
782 invalid = true;
783 }
784 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
785 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
786 {
787 // if one of our PUA characters turns up externally
788 // it must also be treated as an illegal sequence
789 // (a bit like you have to escape an escape character)
790 invalid = true;
791 }
792 else
793 {
794 #ifdef WC_UTF16
795 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
796 size_t pa = encode_utf16(res, (wxUint16 *)buf);
797 if (pa == wxCONV_FAILED)
798 {
799 invalid = true;
800 }
801 else
802 {
803 if (buf)
804 buf += pa;
805 len += pa;
806 }
807 #else // !WC_UTF16
808 if (buf)
809 *buf++ = (wchar_t)res;
810 len++;
811 #endif // WC_UTF16/!WC_UTF16
812 }
813 }
814
815 if (invalid)
816 {
817 if (m_options & MAP_INVALID_UTF8_TO_PUA)
818 {
819 while (opsz < psz && (!buf || len < n))
820 {
821 #ifdef WC_UTF16
822 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
823 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
824 wxASSERT(pa != wxCONV_FAILED);
825 if (buf)
826 buf += pa;
827 opsz++;
828 len += pa;
829 #else
830 if (buf)
831 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
832 opsz++;
833 len++;
834 #endif
835 }
836 }
837 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
838 {
839 while (opsz < psz && (!buf || len < n))
840 {
841 if ( buf && len + 3 < n )
842 {
843 unsigned char on = *opsz;
844 *buf++ = L'\\';
845 *buf++ = (wchar_t)( L'0' + on / 0100 );
846 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
847 *buf++ = (wchar_t)( L'0' + on % 010 );
848 }
849
850 opsz++;
851 len += 4;
852 }
853 }
854 else // MAP_INVALID_UTF8_NOT
855 {
856 return wxCONV_FAILED;
857 }
858 }
859 }
860 }
861
862 if (buf && (len < n))
863 *buf = 0;
864
865 return len;
866 }
867
// returns true if wch is one of the octal digits L'0'..L'7'
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
872
873 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
874 {
875 size_t len = 0;
876
877 while (*psz && ((!buf) || (len < n)))
878 {
879 wxUint32 cc;
880
881 #ifdef WC_UTF16
882 // cast is ok for WC_UTF16
883 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
884 psz += (pa == wxCONV_FAILED) ? 1 : pa;
885 #else
886 cc = (*psz++) & 0x7fffffff;
887 #endif
888
889 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
890 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
891 {
892 if (buf)
893 *buf++ = (char)(cc - wxUnicodePUA);
894 len++;
895 }
896 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
897 && cc == L'\\' && psz[0] == L'\\' )
898 {
899 if (buf)
900 *buf++ = (char)cc;
901 psz++;
902 len++;
903 }
904 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
905 cc == L'\\' &&
906 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
907 {
908 if (buf)
909 {
910 *buf++ = (char) ((psz[0] - L'0') * 0100 +
911 (psz[1] - L'0') * 010 +
912 (psz[2] - L'0'));
913 }
914
915 psz += 3;
916 len++;
917 }
918 else
919 {
920 unsigned cnt;
921 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
922 {
923 }
924
925 if (!cnt)
926 {
927 // plain ASCII char
928 if (buf)
929 *buf++ = (char) cc;
930 len++;
931 }
932 else
933 {
934 len += cnt + 1;
935 if (buf)
936 {
937 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
938 while (cnt--)
939 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
940 }
941 }
942 }
943 }
944
945 if (buf && (len < n))
946 *buf = 0;
947
948 return len;
949 }
950
951 // ============================================================================
952 // UTF-16
953 // ============================================================================
954
// "straight" is the UTF-16 converter selected when WORDS_BIGENDIAN matches
// its byte order (BE if defined, LE otherwise), "swap" is the other one
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
962
963 /* static */
964 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
965 {
966 if ( srcLen == wxNO_LEN )
967 {
968 // count the number of bytes in input, including the trailing NULs
969 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
970 for ( srcLen = 1; *inBuff++; srcLen++ )
971 ;
972
973 srcLen *= BYTES_PER_CHAR;
974 }
975 else // we already have the length
976 {
977 // we can only convert an entire number of UTF-16 characters
978 if ( srcLen % BYTES_PER_CHAR )
979 return wxCONV_FAILED;
980 }
981
982 return srcLen;
983 }
984
985 // case when in-memory representation is UTF-16 too
986 #ifdef WC_UTF16
987
988 // ----------------------------------------------------------------------------
989 // conversions without endianness change
990 // ----------------------------------------------------------------------------
991
992 size_t
993 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
994 const char *src, size_t srcLen) const
995 {
996 // set up the scene for using memcpy() (which is presumably more efficient
997 // than copying the bytes one by one)
998 srcLen = GetLength(src, srcLen);
999 if ( srcLen == wxNO_LEN )
1000 return wxCONV_FAILED;
1001
1002 const size_t inLen = srcLen / BYTES_PER_CHAR;
1003 if ( dst )
1004 {
1005 if ( dstLen < inLen )
1006 return wxCONV_FAILED;
1007
1008 memcpy(dst, src, srcLen);
1009 }
1010
1011 return inLen;
1012 }
1013
1014 size_t
1015 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1016 const wchar_t *src, size_t srcLen) const
1017 {
1018 if ( srcLen == wxNO_LEN )
1019 srcLen = wxWcslen(src) + 1;
1020
1021 srcLen *= BYTES_PER_CHAR;
1022
1023 if ( dst )
1024 {
1025 if ( dstLen < srcLen )
1026 return wxCONV_FAILED;
1027
1028 memcpy(dst, src, srcLen);
1029 }
1030
1031 return srcLen;
1032 }
1033
1034 // ----------------------------------------------------------------------------
1035 // endian-reversing conversions
1036 // ----------------------------------------------------------------------------
1037
1038 size_t
1039 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1040 const char *src, size_t srcLen) const
1041 {
1042 srcLen = GetLength(src, srcLen);
1043 if ( srcLen == wxNO_LEN )
1044 return wxCONV_FAILED;
1045
1046 srcLen /= BYTES_PER_CHAR;
1047
1048 if ( dst )
1049 {
1050 if ( dstLen < srcLen )
1051 return wxCONV_FAILED;
1052
1053 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1054 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1055 {
1056 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1057 }
1058 }
1059
1060 return srcLen;
1061 }
1062
1063 size_t
1064 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1065 const wchar_t *src, size_t srcLen) const
1066 {
1067 if ( srcLen == wxNO_LEN )
1068 srcLen = wxWcslen(src) + 1;
1069
1070 srcLen *= BYTES_PER_CHAR;
1071
1072 if ( dst )
1073 {
1074 if ( dstLen < srcLen )
1075 return wxCONV_FAILED;
1076
1077 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1078 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1079 {
1080 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1081 }
1082 }
1083
1084 return srcLen;
1085 }
1086
1087 #else // !WC_UTF16: wchar_t is UTF-32
1088
1089 // ----------------------------------------------------------------------------
1090 // conversions without endianness change
1091 // ----------------------------------------------------------------------------
1092
1093 size_t
1094 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1095 const char *src, size_t srcLen) const
1096 {
1097 srcLen = GetLength(src, srcLen);
1098 if ( srcLen == wxNO_LEN )
1099 return wxCONV_FAILED;
1100
1101 const size_t inLen = srcLen / BYTES_PER_CHAR;
1102 if ( !dst )
1103 {
1104 // optimization: return maximal space which could be needed for this
1105 // string even if the real size could be smaller if the buffer contains
1106 // any surrogates
1107 return inLen;
1108 }
1109
1110 size_t outLen = 0;
1111 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1112 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1113 {
1114 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1115 if ( !inBuff )
1116 return wxCONV_FAILED;
1117
1118 if ( ++outLen > dstLen )
1119 return wxCONV_FAILED;
1120
1121 *dst++ = ch;
1122 }
1123
1124
1125 return outLen;
1126 }
1127
1128 size_t
1129 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1130 const wchar_t *src, size_t srcLen) const
1131 {
1132 if ( srcLen == wxNO_LEN )
1133 srcLen = wxWcslen(src) + 1;
1134
1135 size_t outLen = 0;
1136 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1137 for ( size_t n = 0; n < srcLen; n++ )
1138 {
1139 wxUint16 cc[2];
1140 const size_t numChars = encode_utf16(*src++, cc);
1141 if ( numChars == wxCONV_FAILED )
1142 return wxCONV_FAILED;
1143
1144 outLen += numChars * BYTES_PER_CHAR;
1145 if ( outBuff )
1146 {
1147 if ( outLen > dstLen )
1148 return wxCONV_FAILED;
1149
1150 *outBuff++ = cc[0];
1151 if ( numChars == 2 )
1152 {
1153 // second character of a surrogate
1154 *outBuff++ = cc[1];
1155 }
1156 }
1157 }
1158
1159 return outLen;
1160 }
1161
1162 // ----------------------------------------------------------------------------
1163 // endian-reversing conversions
1164 // ----------------------------------------------------------------------------
1165
1166 size_t
1167 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1168 const char *src, size_t srcLen) const
1169 {
1170 srcLen = GetLength(src, srcLen);
1171 if ( srcLen == wxNO_LEN )
1172 return wxCONV_FAILED;
1173
1174 const size_t inLen = srcLen / BYTES_PER_CHAR;
1175 if ( !dst )
1176 {
1177 // optimization: return maximal space which could be needed for this
1178 // string even if the real size could be smaller if the buffer contains
1179 // any surrogates
1180 return inLen;
1181 }
1182
1183 size_t outLen = 0;
1184 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1185 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1186 {
1187 wxUint32 ch;
1188 wxUint16 tmp[2];
1189
1190 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1191 inBuff++;
1192 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1193
1194 const size_t numChars = decode_utf16(tmp, ch);
1195 if ( numChars == wxCONV_FAILED )
1196 return wxCONV_FAILED;
1197
1198 if ( numChars == 2 )
1199 inBuff++;
1200
1201 if ( ++outLen > dstLen )
1202 return wxCONV_FAILED;
1203
1204 *dst++ = ch;
1205 }
1206
1207
1208 return outLen;
1209 }
1210
1211 size_t
1212 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1213 const wchar_t *src, size_t srcLen) const
1214 {
1215 if ( srcLen == wxNO_LEN )
1216 srcLen = wxWcslen(src) + 1;
1217
1218 size_t outLen = 0;
1219 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1220 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1221 {
1222 wxUint16 cc[2];
1223 const size_t numChars = encode_utf16(*src, cc);
1224 if ( numChars == wxCONV_FAILED )
1225 return wxCONV_FAILED;
1226
1227 outLen += numChars * BYTES_PER_CHAR;
1228 if ( outBuff )
1229 {
1230 if ( outLen > dstLen )
1231 return wxCONV_FAILED;
1232
1233 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1234 if ( numChars == 2 )
1235 {
1236 // second character of a surrogate
1237 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1238 }
1239 }
1240 }
1241
1242 return outLen;
1243 }
1244
1245 #endif // WC_UTF16/!WC_UTF16
1246
1247
1248 // ============================================================================
1249 // UTF-32
1250 // ============================================================================
1251
// "straight" is the UTF-32 converter selected when WORDS_BIGENDIAN matches
// its byte order (BE if defined, LE otherwise), "swap" is the other one
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32LE
    #define wxMBConvUTF32swap wxMBConvUTF32BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF32straight wxMBConvUTF32BE
    #define wxMBConvUTF32swap wxMBConvUTF32LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1259
1260
1261 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1262 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1263
1264 /* static */
1265 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1266 {
1267 if ( srcLen == wxNO_LEN )
1268 {
1269 // count the number of bytes in input, including the trailing NULs
1270 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1271 for ( srcLen = 1; *inBuff++; srcLen++ )
1272 ;
1273
1274 srcLen *= BYTES_PER_CHAR;
1275 }
1276 else // we already have the length
1277 {
1278 // we can only convert an entire number of UTF-32 characters
1279 if ( srcLen % BYTES_PER_CHAR )
1280 return wxCONV_FAILED;
1281 }
1282
1283 return srcLen;
1284 }
1285
1286 // case when in-memory representation is UTF-16
1287 #ifdef WC_UTF16
1288
1289 // ----------------------------------------------------------------------------
1290 // conversions without endianness change
1291 // ----------------------------------------------------------------------------
1292
1293 size_t
1294 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1295 const char *src, size_t srcLen) const
1296 {
1297 srcLen = GetLength(src, srcLen);
1298 if ( srcLen == wxNO_LEN )
1299 return wxCONV_FAILED;
1300
1301 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1302 const size_t inLen = srcLen / BYTES_PER_CHAR;
1303 size_t outLen = 0;
1304 for ( size_t n = 0; n < inLen; n++ )
1305 {
1306 wxUint16 cc[2];
1307 const size_t numChars = encode_utf16(*inBuff++, cc);
1308 if ( numChars == wxCONV_FAILED )
1309 return wxCONV_FAILED;
1310
1311 outLen += numChars;
1312 if ( dst )
1313 {
1314 if ( outLen > dstLen )
1315 return wxCONV_FAILED;
1316
1317 *dst++ = cc[0];
1318 if ( numChars == 2 )
1319 {
1320 // second character of a surrogate
1321 *dst++ = cc[1];
1322 }
1323 }
1324 }
1325
1326 return outLen;
1327 }
1328
1329 size_t
1330 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1331 const wchar_t *src, size_t srcLen) const
1332 {
1333 if ( srcLen == wxNO_LEN )
1334 srcLen = wxWcslen(src) + 1;
1335
1336 if ( !dst )
1337 {
1338 // optimization: return maximal space which could be needed for this
1339 // string instead of the exact amount which could be less if there are
1340 // any surrogates in the input
1341 //
1342 // we consider that surrogates are rare enough to make it worthwhile to
1343 // avoid running the loop below at the cost of slightly extra memory
1344 // consumption
1345 return srcLen * BYTES_PER_CHAR;
1346 }
1347
1348 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1349 size_t outLen = 0;
1350 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1351 {
1352 const wxUint32 ch = wxDecodeSurrogate(&src);
1353 if ( !src )
1354 return wxCONV_FAILED;
1355
1356 outLen += BYTES_PER_CHAR;
1357
1358 if ( outLen > dstLen )
1359 return wxCONV_FAILED;
1360
1361 *outBuff++ = ch;
1362 }
1363
1364 return outLen;
1365 }
1366
1367 // ----------------------------------------------------------------------------
1368 // endian-reversing conversions
1369 // ----------------------------------------------------------------------------
1370
1371 size_t
1372 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1373 const char *src, size_t srcLen) const
1374 {
1375 srcLen = GetLength(src, srcLen);
1376 if ( srcLen == wxNO_LEN )
1377 return wxCONV_FAILED;
1378
1379 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1380 const size_t inLen = srcLen / BYTES_PER_CHAR;
1381 size_t outLen = 0;
1382 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1383 {
1384 wxUint16 cc[2];
1385 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1386 if ( numChars == wxCONV_FAILED )
1387 return wxCONV_FAILED;
1388
1389 outLen += numChars;
1390 if ( dst )
1391 {
1392 if ( outLen > dstLen )
1393 return wxCONV_FAILED;
1394
1395 *dst++ = cc[0];
1396 if ( numChars == 2 )
1397 {
1398 // second character of a surrogate
1399 *dst++ = cc[1];
1400 }
1401 }
1402 }
1403
1404 return outLen;
1405 }
1406
1407 size_t
1408 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1409 const wchar_t *src, size_t srcLen) const
1410 {
1411 if ( srcLen == wxNO_LEN )
1412 srcLen = wxWcslen(src) + 1;
1413
1414 if ( !dst )
1415 {
1416 // optimization: return maximal space which could be needed for this
1417 // string instead of the exact amount which could be less if there are
1418 // any surrogates in the input
1419 //
1420 // we consider that surrogates are rare enough to make it worthwhile to
1421 // avoid running the loop below at the cost of slightly extra memory
1422 // consumption
1423 return srcLen*BYTES_PER_CHAR;
1424 }
1425
1426 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1427 size_t outLen = 0;
1428 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1429 {
1430 const wxUint32 ch = wxDecodeSurrogate(&src);
1431 if ( !src )
1432 return wxCONV_FAILED;
1433
1434 outLen += BYTES_PER_CHAR;
1435
1436 if ( outLen > dstLen )
1437 return wxCONV_FAILED;
1438
1439 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1440 }
1441
1442 return outLen;
1443 }
1444
1445 #else // !WC_UTF16: wchar_t is UTF-32
1446
1447 // ----------------------------------------------------------------------------
1448 // conversions without endianness change
1449 // ----------------------------------------------------------------------------
1450
1451 size_t
1452 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1453 const char *src, size_t srcLen) const
1454 {
1455 // use memcpy() as it should be much faster than hand-written loop
1456 srcLen = GetLength(src, srcLen);
1457 if ( srcLen == wxNO_LEN )
1458 return wxCONV_FAILED;
1459
1460 const size_t inLen = srcLen/BYTES_PER_CHAR;
1461 if ( dst )
1462 {
1463 if ( dstLen < inLen )
1464 return wxCONV_FAILED;
1465
1466 memcpy(dst, src, srcLen);
1467 }
1468
1469 return inLen;
1470 }
1471
1472 size_t
1473 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1474 const wchar_t *src, size_t srcLen) const
1475 {
1476 if ( srcLen == wxNO_LEN )
1477 srcLen = wxWcslen(src) + 1;
1478
1479 srcLen *= BYTES_PER_CHAR;
1480
1481 if ( dst )
1482 {
1483 if ( dstLen < srcLen )
1484 return wxCONV_FAILED;
1485
1486 memcpy(dst, src, srcLen);
1487 }
1488
1489 return srcLen;
1490 }
1491
1492 // ----------------------------------------------------------------------------
1493 // endian-reversing conversions
1494 // ----------------------------------------------------------------------------
1495
1496 size_t
1497 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1498 const char *src, size_t srcLen) const
1499 {
1500 srcLen = GetLength(src, srcLen);
1501 if ( srcLen == wxNO_LEN )
1502 return wxCONV_FAILED;
1503
1504 srcLen /= BYTES_PER_CHAR;
1505
1506 if ( dst )
1507 {
1508 if ( dstLen < srcLen )
1509 return wxCONV_FAILED;
1510
1511 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1512 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1513 {
1514 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1515 }
1516 }
1517
1518 return srcLen;
1519 }
1520
1521 size_t
1522 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1523 const wchar_t *src, size_t srcLen) const
1524 {
1525 if ( srcLen == wxNO_LEN )
1526 srcLen = wxWcslen(src) + 1;
1527
1528 srcLen *= BYTES_PER_CHAR;
1529
1530 if ( dst )
1531 {
1532 if ( dstLen < srcLen )
1533 return wxCONV_FAILED;
1534
1535 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1536 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1537 {
1538 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1539 }
1540 }
1541
1542 return srcLen;
1543 }
1544
1545 #endif // WC_UTF16/!WC_UTF16
1546
1547
1548 // ============================================================================
1549 // The classes doing conversion using the iconv_xxx() functions
1550 // ============================================================================
1551
1552 #ifdef HAVE_ICONV
1553
1554 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1555 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1556 // (unless there's yet another bug in glibc) the only case when iconv()
1557 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1558 // left in the input buffer -- when _real_ error occurs,
1559 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1560 // iconv() failure.
1561 // [This bug does not appear in glibc 2.2.]
1562 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1563 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1564 (errno != E2BIG || bufLeft != 0))
1565 #else
1566 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1567 #endif
1568
1569 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1570
1571 #define ICONV_T_INVALID ((iconv_t)-1)
1572
1573 #if SIZEOF_WCHAR_T == 4
1574 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1575 #define WC_ENC wxFONTENCODING_UTF32
1576 #elif SIZEOF_WCHAR_T == 2
1577 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1578 #define WC_ENC wxFONTENCODING_UTF16
1579 #else // sizeof(wchar_t) != 2 nor 4
1580 // does this ever happen?
1581 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1582 #endif
1583
1584 // ----------------------------------------------------------------------------
1585 // wxMBConv_iconv: encapsulates an iconv character set
1586 // ----------------------------------------------------------------------------
1587
1588 class wxMBConv_iconv : public wxMBConv
1589 {
1590 public:
1591 wxMBConv_iconv(const char *name);
1592 virtual ~wxMBConv_iconv();
1593
1594 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1595 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1596
1597 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1598 virtual size_t GetMBNulLen() const;
1599
1600 #if wxUSE_UNICODE_UTF8
1601 virtual bool IsUTF8() const;
1602 #endif
1603
1604 virtual wxMBConv *Clone() const
1605 {
1606 wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1607 p->m_minMBCharWidth = m_minMBCharWidth;
1608 return p;
1609 }
1610
1611 bool IsOk() const
1612 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1613
1614 protected:
1615 // the iconv handlers used to translate from multibyte
1616 // to wide char and in the other direction
1617 iconv_t m2w,
1618 w2m;
1619
1620 #if wxUSE_THREADS
1621 // guards access to m2w and w2m objects
1622 wxMutex m_iconvMutex;
1623 #endif
1624
1625 private:
1626 // the name (for iconv_open()) of a wide char charset -- if none is
1627 // available on this machine, it will remain NULL
1628 static wxString ms_wcCharsetName;
1629
1630 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1631 // different endian-ness than the native one
1632 static bool ms_wcNeedsSwap;
1633
1634
1635 // name of the encoding handled by this conversion
1636 wxString m_name;
1637
1638 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1639 // initially
1640 size_t m_minMBCharWidth;
1641 };
1642
1643 // make the constructor available for unit testing
1644 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1645 {
1646 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1647 if ( !result->IsOk() )
1648 {
1649 delete result;
1650 return 0;
1651 }
1652
1653 return result;
1654 }
1655
1656 wxString wxMBConv_iconv::ms_wcCharsetName;
1657 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1658
1659 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1660 : m_name(name)
1661 {
1662 m_minMBCharWidth = 0;
1663
1664 // check for charset that represents wchar_t:
1665 if ( ms_wcCharsetName.empty() )
1666 {
1667 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1668
1669 #if wxUSE_FONTMAP
1670 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1671 #else // !wxUSE_FONTMAP
1672 static const wxChar *names_static[] =
1673 {
1674 #if SIZEOF_WCHAR_T == 4
1675 _T("UCS-4"),
1676 #elif SIZEOF_WCHAR_T = 2
1677 _T("UCS-2"),
1678 #endif
1679 NULL
1680 };
1681 const wxChar **names = names_static;
1682 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1683
1684 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1685 {
1686 const wxString nameCS(*names);
1687
1688 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1689 wxString nameXE(nameCS);
1690
1691 #ifdef WORDS_BIGENDIAN
1692 nameXE += _T("BE");
1693 #else // little endian
1694 nameXE += _T("LE");
1695 #endif
1696
1697 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1698 nameXE.c_str());
1699
1700 m2w = iconv_open(nameXE.ToAscii(), name);
1701 if ( m2w == ICONV_T_INVALID )
1702 {
1703 // try charset w/o bytesex info (e.g. "UCS4")
1704 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1705 nameCS.c_str());
1706 m2w = iconv_open(nameCS.ToAscii(), name);
1707
1708 // and check for bytesex ourselves:
1709 if ( m2w != ICONV_T_INVALID )
1710 {
1711 char buf[2], *bufPtr;
1712 wchar_t wbuf[2], *wbufPtr;
1713 size_t insz, outsz;
1714 size_t res;
1715
1716 buf[0] = 'A';
1717 buf[1] = 0;
1718 wbuf[0] = 0;
1719 insz = 2;
1720 outsz = SIZEOF_WCHAR_T * 2;
1721 wbufPtr = wbuf;
1722 bufPtr = buf;
1723
1724 res = iconv(
1725 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1726 (char**)&wbufPtr, &outsz);
1727
1728 if (ICONV_FAILED(res, insz))
1729 {
1730 wxLogLastError(wxT("iconv"));
1731 wxLogError(_("Conversion to charset '%s' doesn't work."),
1732 nameCS.c_str());
1733 }
1734 else // ok, can convert to this encoding, remember it
1735 {
1736 ms_wcCharsetName = nameCS;
1737 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1738 }
1739 }
1740 }
1741 else // use charset not requiring byte swapping
1742 {
1743 ms_wcCharsetName = nameXE;
1744 }
1745 }
1746
1747 wxLogTrace(TRACE_STRCONV,
1748 wxT("iconv wchar_t charset is \"%s\"%s"),
1749 ms_wcCharsetName.empty() ? wxString("<none>")
1750 : ms_wcCharsetName,
1751 ms_wcNeedsSwap ? _T(" (needs swap)")
1752 : _T(""));
1753 }
1754 else // we already have ms_wcCharsetName
1755 {
1756 m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
1757 }
1758
1759 if ( ms_wcCharsetName.empty() )
1760 {
1761 w2m = ICONV_T_INVALID;
1762 }
1763 else
1764 {
1765 w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
1766 if ( w2m == ICONV_T_INVALID )
1767 {
1768 wxLogTrace(TRACE_STRCONV,
1769 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1770 ms_wcCharsetName.c_str(), name);
1771 }
1772 }
1773 }
1774
1775 wxMBConv_iconv::~wxMBConv_iconv()
1776 {
1777 if ( m2w != ICONV_T_INVALID )
1778 iconv_close(m2w);
1779 if ( w2m != ICONV_T_INVALID )
1780 iconv_close(w2m);
1781 }
1782
1783 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1784 {
1785 // find the string length: notice that must be done differently for
1786 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1787 size_t inbuf;
1788 const size_t nulLen = GetMBNulLen();
1789 switch ( nulLen )
1790 {
1791 default:
1792 return wxCONV_FAILED;
1793
1794 case 1:
1795 inbuf = strlen(psz); // arguably more optimized than our version
1796 break;
1797
1798 case 2:
1799 case 4:
1800 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1801 // they also have to start at character boundary and not span two
1802 // adjacent characters
1803 const char *p;
1804 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1805 ;
1806 inbuf = p - psz;
1807 break;
1808 }
1809
1810 #if wxUSE_THREADS
1811 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1812 // Unfortunately there are a couple of global wxCSConv objects such as
1813 // wxConvLocal that are used all over wx code, so we have to make sure
1814 // the handle is used by at most one thread at the time. Otherwise
1815 // only a few wx classes would be safe to use from non-main threads
1816 // as MB<->WC conversion would fail "randomly".
1817 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1818 #endif // wxUSE_THREADS
1819
1820 size_t outbuf = n * SIZEOF_WCHAR_T;
1821 size_t res, cres;
1822 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1823 wchar_t *bufPtr = buf;
1824 const char *pszPtr = psz;
1825
1826 if (buf)
1827 {
1828 // have destination buffer, convert there
1829 cres = iconv(m2w,
1830 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1831 (char**)&bufPtr, &outbuf);
1832 res = n - (outbuf / SIZEOF_WCHAR_T);
1833
1834 if (ms_wcNeedsSwap)
1835 {
1836 // convert to native endianness
1837 for ( unsigned i = 0; i < res; i++ )
1838 buf[n] = WC_BSWAP(buf[i]);
1839 }
1840
1841 // NUL-terminate the string if there is any space left
1842 if (res < n)
1843 buf[res] = 0;
1844 }
1845 else
1846 {
1847 // no destination buffer... convert using temp buffer
1848 // to calculate destination buffer requirement
1849 wchar_t tbuf[8];
1850 res = 0;
1851
1852 do
1853 {
1854 bufPtr = tbuf;
1855 outbuf = 8 * SIZEOF_WCHAR_T;
1856
1857 cres = iconv(m2w,
1858 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1859 (char**)&bufPtr, &outbuf );
1860
1861 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1862 }
1863 while ((cres == (size_t)-1) && (errno == E2BIG));
1864 }
1865
1866 if (ICONV_FAILED(cres, inbuf))
1867 {
1868 //VS: it is ok if iconv fails, hence trace only
1869 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1870 return wxCONV_FAILED;
1871 }
1872
1873 return res;
1874 }
1875
1876 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1877 {
1878 #if wxUSE_THREADS
1879 // NB: explained in MB2WC
1880 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1881 #endif
1882
1883 size_t inlen = wxWcslen(psz);
1884 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1885 size_t outbuf = n;
1886 size_t res, cres;
1887
1888 wchar_t *tmpbuf = 0;
1889
1890 if (ms_wcNeedsSwap)
1891 {
1892 // need to copy to temp buffer to switch endianness
1893 // (doing WC_BSWAP twice on the original buffer won't help, as it
1894 // could be in read-only memory, or be accessed in some other thread)
1895 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1896 for ( size_t i = 0; i < inlen; i++ )
1897 tmpbuf[n] = WC_BSWAP(psz[i]);
1898
1899 tmpbuf[inlen] = L'\0';
1900 psz = tmpbuf;
1901 }
1902
1903 if (buf)
1904 {
1905 // have destination buffer, convert there
1906 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1907
1908 res = n - outbuf;
1909
1910 // NB: iconv was given only wcslen(psz) characters on input, and so
1911 // it couldn't convert the trailing zero. Let's do it ourselves
1912 // if there's some room left for it in the output buffer.
1913 if (res < n)
1914 buf[0] = 0;
1915 }
1916 else
1917 {
1918 // no destination buffer: convert using temp buffer
1919 // to calculate destination buffer requirement
1920 char tbuf[16];
1921 res = 0;
1922 do
1923 {
1924 buf = tbuf;
1925 outbuf = 16;
1926
1927 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1928
1929 res += 16 - outbuf;
1930 }
1931 while ((cres == (size_t)-1) && (errno == E2BIG));
1932 }
1933
1934 if (ms_wcNeedsSwap)
1935 {
1936 free(tmpbuf);
1937 }
1938
1939 if (ICONV_FAILED(cres, inbuf))
1940 {
1941 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1942 return wxCONV_FAILED;
1943 }
1944
1945 return res;
1946 }
1947
1948 size_t wxMBConv_iconv::GetMBNulLen() const
1949 {
1950 if ( m_minMBCharWidth == 0 )
1951 {
1952 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1953
1954 #if wxUSE_THREADS
1955 // NB: explained in MB2WC
1956 wxMutexLocker lock(self->m_iconvMutex);
1957 #endif
1958
1959 const wchar_t *wnul = L"";
1960 char buf[8]; // should be enough for NUL in any encoding
1961 size_t inLen = sizeof(wchar_t),
1962 outLen = WXSIZEOF(buf);
1963 char *inBuff = (char *)wnul;
1964 char *outBuff = buf;
1965 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1966 {
1967 self->m_minMBCharWidth = (size_t)-1;
1968 }
1969 else // ok
1970 {
1971 self->m_minMBCharWidth = outBuff - buf;
1972 }
1973 }
1974
1975 return m_minMBCharWidth;
1976 }
1977
#if wxUSE_UNICODE_UTF8
// Return true if the name of this encoding is "UTF-8" or "UTF8", compared
// with wxStricmp().
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif // wxUSE_UNICODE_UTF8
1985
1986 #endif // HAVE_ICONV
1987
1988
1989 // ============================================================================
1990 // Win32 conversion classes
1991 // ============================================================================
1992
1993 #ifdef wxHAVE_WIN32_MB2WC
1994
1995 // from utils.cpp
1996 #if wxUSE_FONTMAP
1997 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
1998 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1999 #endif
2000
2001 class wxMBConv_win32 : public wxMBConv
2002 {
2003 public:
2004 wxMBConv_win32()
2005 {
2006 m_CodePage = CP_ACP;
2007 m_minMBCharWidth = 0;
2008 }
2009
2010 wxMBConv_win32(const wxMBConv_win32& conv)
2011 : wxMBConv()
2012 {
2013 m_CodePage = conv.m_CodePage;
2014 m_minMBCharWidth = conv.m_minMBCharWidth;
2015 }
2016
2017 #if wxUSE_FONTMAP
2018 wxMBConv_win32(const char* name)
2019 {
2020 m_CodePage = wxCharsetToCodepage(name);
2021 m_minMBCharWidth = 0;
2022 }
2023
2024 wxMBConv_win32(wxFontEncoding encoding)
2025 {
2026 m_CodePage = wxEncodingToCodepage(encoding);
2027 m_minMBCharWidth = 0;
2028 }
2029 #endif // wxUSE_FONTMAP
2030
2031 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2032 {
2033 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2034 // the behaviour is not compatible with the Unix version (using iconv)
2035 // and break the library itself, e.g. wxTextInputStream::NextChar()
2036 // wouldn't work if reading an incomplete MB char didn't result in an
2037 // error
2038 //
2039 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2040 // Win XP or newer and it is not supported for UTF-[78] so we always
2041 // use our own conversions in this case. See
2042 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2043 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2044 if ( m_CodePage == CP_UTF8 )
2045 {
2046 return wxMBConvUTF8().MB2WC(buf, psz, n);
2047 }
2048
2049 if ( m_CodePage == CP_UTF7 )
2050 {
2051 return wxMBConvUTF7().MB2WC(buf, psz, n);
2052 }
2053
2054 int flags = 0;
2055 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2056 IsAtLeastWin2kSP4() )
2057 {
2058 flags = MB_ERR_INVALID_CHARS;
2059 }
2060
2061 const size_t len = ::MultiByteToWideChar
2062 (
2063 m_CodePage, // code page
2064 flags, // flags: fall on error
2065 psz, // input string
2066 -1, // its length (NUL-terminated)
2067 buf, // output string
2068 buf ? n : 0 // size of output buffer
2069 );
2070 if ( !len )
2071 {
2072 // function totally failed
2073 return wxCONV_FAILED;
2074 }
2075
2076 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2077 // check if we succeeded, by doing a double trip:
2078 if ( !flags && buf )
2079 {
2080 const size_t mbLen = strlen(psz);
2081 wxCharBuffer mbBuf(mbLen);
2082 if ( ::WideCharToMultiByte
2083 (
2084 m_CodePage,
2085 0,
2086 buf,
2087 -1,
2088 mbBuf.data(),
2089 mbLen + 1, // size in bytes, not length
2090 NULL,
2091 NULL
2092 ) == 0 ||
2093 strcmp(mbBuf, psz) != 0 )
2094 {
2095 // we didn't obtain the same thing we started from, hence
2096 // the conversion was lossy and we consider that it failed
2097 return wxCONV_FAILED;
2098 }
2099 }
2100
2101 // note that it returns count of written chars for buf != NULL and size
2102 // of the needed buffer for buf == NULL so in either case the length of
2103 // the string (which never includes the terminating NUL) is one less
2104 return len - 1;
2105 }
2106
2107 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2108 {
2109 /*
2110 we have a problem here: by default, WideCharToMultiByte() may
2111 replace characters unrepresentable in the target code page with bad
2112 quality approximations such as turning "1/2" symbol (U+00BD) into
2113 "1" for the code pages which don't have it and we, obviously, want
2114 to avoid this at any price
2115
2116 the trouble is that this function does it _silently_, i.e. it won't
2117 even tell us whether it did or not... Win98/2000 and higher provide
2118 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2119 we have to resort to a round trip, i.e. check that converting back
2120 results in the same string -- this is, of course, expensive but
2121 otherwise we simply can't be sure to not garble the data.
2122 */
2123
2124 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2125 // it doesn't work with CJK encodings (which we test for rather roughly
2126 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2127 // supporting it
2128 BOOL usedDef wxDUMMY_INITIALIZE(false);
2129 BOOL *pUsedDef;
2130 int flags;
2131 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2132 {
2133 // it's our lucky day
2134 flags = WC_NO_BEST_FIT_CHARS;
2135 pUsedDef = &usedDef;
2136 }
2137 else // old system or unsupported encoding
2138 {
2139 flags = 0;
2140 pUsedDef = NULL;
2141 }
2142
2143 const size_t len = ::WideCharToMultiByte
2144 (
2145 m_CodePage, // code page
2146 flags, // either none or no best fit
2147 pwz, // input string
2148 -1, // it is (wide) NUL-terminated
2149 buf, // output buffer
2150 buf ? n : 0, // and its size
2151 NULL, // default "replacement" char
2152 pUsedDef // [out] was it used?
2153 );
2154
2155 if ( !len )
2156 {
2157 // function totally failed
2158 return wxCONV_FAILED;
2159 }
2160
2161 // if we were really converting, check if we succeeded
2162 if ( buf )
2163 {
2164 if ( flags )
2165 {
2166 // check if the conversion failed, i.e. if any replacements
2167 // were done
2168 if ( usedDef )
2169 return wxCONV_FAILED;
2170 }
2171 else // we must resort to double tripping...
2172 {
2173 wxWCharBuffer wcBuf(n);
2174 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2175 wcscmp(wcBuf, pwz) != 0 )
2176 {
2177 // we didn't obtain the same thing we started from, hence
2178 // the conversion was lossy and we consider that it failed
2179 return wxCONV_FAILED;
2180 }
2181 }
2182 }
2183
2184 // see the comment above for the reason of "len - 1"
2185 return len - 1;
2186 }
2187
2188 virtual size_t GetMBNulLen() const
2189 {
2190 if ( m_minMBCharWidth == 0 )
2191 {
2192 int len = ::WideCharToMultiByte
2193 (
2194 m_CodePage, // code page
2195 0, // no flags
2196 L"", // input string
2197 1, // translate just the NUL
2198 NULL, // output buffer
2199 0, // and its size
2200 NULL, // no replacement char
2201 NULL // [out] don't care if it was used
2202 );
2203
2204 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2205 switch ( len )
2206 {
2207 default:
2208 wxLogDebug(_T("Unexpected NUL length %d"), len);
2209 self->m_minMBCharWidth = (size_t)-1;
2210 break;
2211
2212 case 0:
2213 self->m_minMBCharWidth = (size_t)-1;
2214 break;
2215
2216 case 1:
2217 case 2:
2218 case 4:
2219 self->m_minMBCharWidth = len;
2220 break;
2221 }
2222 }
2223
2224 return m_minMBCharWidth;
2225 }
2226
2227 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2228
2229 bool IsOk() const { return m_CodePage != -1; }
2230
2231 private:
2232 static bool CanUseNoBestFit()
2233 {
2234 static int s_isWin98Or2k = -1;
2235
2236 if ( s_isWin98Or2k == -1 )
2237 {
2238 int verMaj, verMin;
2239 switch ( wxGetOsVersion(&verMaj, &verMin) )
2240 {
2241 case wxOS_WINDOWS_9X:
2242 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2243 break;
2244
2245 case wxOS_WINDOWS_NT:
2246 s_isWin98Or2k = verMaj >= 5;
2247 break;
2248
2249 default:
2250 // unknown: be conservative by default
2251 s_isWin98Or2k = 0;
2252 break;
2253 }
2254
2255 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2256 }
2257
2258 return s_isWin98Or2k == 1;
2259 }
2260
2261 static bool IsAtLeastWin2kSP4()
2262 {
2263 #ifdef __WXWINCE__
2264 return false;
2265 #else
2266 static int s_isAtLeastWin2kSP4 = -1;
2267
2268 if ( s_isAtLeastWin2kSP4 == -1 )
2269 {
2270 OSVERSIONINFOEX ver;
2271
2272 memset(&ver, 0, sizeof(ver));
2273 ver.dwOSVersionInfoSize = sizeof(ver);
2274 GetVersionEx((OSVERSIONINFO*)&ver);
2275
2276 s_isAtLeastWin2kSP4 =
2277 ((ver.dwMajorVersion > 5) || // Vista+
2278 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2279 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2280 ver.wServicePackMajor >= 4)) // 2000 SP4+
2281 ? 1 : 0;
2282 }
2283
2284 return s_isAtLeastWin2kSP4 == 1;
2285 #endif
2286 }
2287
2288
2289 // the code page we're working with
2290 long m_CodePage;
2291
2292 // cached result of GetMBNulLen(), set to 0 initially meaning
2293 // "unknown"
2294 size_t m_minMBCharWidth;
2295 };
2296
2297 #endif // wxHAVE_WIN32_MB2WC
2298
2299 // ============================================================================
2300 // Cocoa conversion classes
2301 // ============================================================================
2302
2303 // DE: Does anyone know the purpose of this code?
2304 // This file is compiled in the base library, so __WXCOCOA__ check is totally wrong
2305 // in the first place.
2306 #if 0 // defined(__WXCOCOA__)
2307
2308 // RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2309 // Strangely enough, Core Foundation uses UTF-32 internally
2310 // quite a bit - it's just not public (yet).
2311
2312 #include <CoreFoundation/CFString.h>
2313 #include <CoreFoundation/CFStringEncodingExt.h>
2314
2315 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2316 {
2317 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2318
2319 switch (encoding)
2320 {
2321 case wxFONTENCODING_DEFAULT :
2322 enc = CFStringGetSystemEncoding();
2323 break ;
2324
2325 case wxFONTENCODING_ISO8859_1 :
2326 enc = kCFStringEncodingISOLatin1 ;
2327 break ;
2328 case wxFONTENCODING_ISO8859_2 :
2329 enc = kCFStringEncodingISOLatin2;
2330 break ;
2331 case wxFONTENCODING_ISO8859_3 :
2332 enc = kCFStringEncodingISOLatin3 ;
2333 break ;
2334 case wxFONTENCODING_ISO8859_4 :
2335 enc = kCFStringEncodingISOLatin4;
2336 break ;
2337 case wxFONTENCODING_ISO8859_5 :
2338 enc = kCFStringEncodingISOLatinCyrillic;
2339 break ;
2340 case wxFONTENCODING_ISO8859_6 :
2341 enc = kCFStringEncodingISOLatinArabic;
2342 break ;
2343 case wxFONTENCODING_ISO8859_7 :
2344 enc = kCFStringEncodingISOLatinGreek;
2345 break ;
2346 case wxFONTENCODING_ISO8859_8 :
2347 enc = kCFStringEncodingISOLatinHebrew;
2348 break ;
2349 case wxFONTENCODING_ISO8859_9 :
2350 enc = kCFStringEncodingISOLatin5;
2351 break ;
2352 case wxFONTENCODING_ISO8859_10 :
2353 enc = kCFStringEncodingISOLatin6;
2354 break ;
2355 case wxFONTENCODING_ISO8859_11 :
2356 enc = kCFStringEncodingISOLatinThai;
2357 break ;
2358 case wxFONTENCODING_ISO8859_13 :
2359 enc = kCFStringEncodingISOLatin7;
2360 break ;
2361 case wxFONTENCODING_ISO8859_14 :
2362 enc = kCFStringEncodingISOLatin8;
2363 break ;
2364 case wxFONTENCODING_ISO8859_15 :
2365 enc = kCFStringEncodingISOLatin9;
2366 break ;
2367
2368 case wxFONTENCODING_KOI8 :
2369 enc = kCFStringEncodingKOI8_R;
2370 break ;
2371 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2372 enc = kCFStringEncodingDOSRussian;
2373 break ;
2374
2375 // case wxFONTENCODING_BULGARIAN :
2376 // enc = ;
2377 // break ;
2378
2379 case wxFONTENCODING_CP437 :
2380 enc = kCFStringEncodingDOSLatinUS ;
2381 break ;
2382 case wxFONTENCODING_CP850 :
2383 enc = kCFStringEncodingDOSLatin1;
2384 break ;
2385 case wxFONTENCODING_CP852 :
2386 enc = kCFStringEncodingDOSLatin2;
2387 break ;
2388 case wxFONTENCODING_CP855 :
2389 enc = kCFStringEncodingDOSCyrillic;
2390 break ;
2391 case wxFONTENCODING_CP866 :
2392 enc = kCFStringEncodingDOSRussian ;
2393 break ;
2394 case wxFONTENCODING_CP874 :
2395 enc = kCFStringEncodingDOSThai;
2396 break ;
2397 case wxFONTENCODING_CP932 :
2398 enc = kCFStringEncodingDOSJapanese;
2399 break ;
2400 case wxFONTENCODING_CP936 :
2401 enc = kCFStringEncodingDOSChineseSimplif ;
2402 break ;
2403 case wxFONTENCODING_CP949 :
2404 enc = kCFStringEncodingDOSKorean;
2405 break ;
2406 case wxFONTENCODING_CP950 :
2407 enc = kCFStringEncodingDOSChineseTrad;
2408 break ;
2409 case wxFONTENCODING_CP1250 :
2410 enc = kCFStringEncodingWindowsLatin2;
2411 break ;
2412 case wxFONTENCODING_CP1251 :
2413 enc = kCFStringEncodingWindowsCyrillic ;
2414 break ;
2415 case wxFONTENCODING_CP1252 :
2416 enc = kCFStringEncodingWindowsLatin1 ;
2417 break ;
2418 case wxFONTENCODING_CP1253 :
2419 enc = kCFStringEncodingWindowsGreek;
2420 break ;
2421 case wxFONTENCODING_CP1254 :
2422 enc = kCFStringEncodingWindowsLatin5;
2423 break ;
2424 case wxFONTENCODING_CP1255 :
2425 enc = kCFStringEncodingWindowsHebrew ;
2426 break ;
2427 case wxFONTENCODING_CP1256 :
2428 enc = kCFStringEncodingWindowsArabic ;
2429 break ;
2430 case wxFONTENCODING_CP1257 :
2431 enc = kCFStringEncodingWindowsBalticRim;
2432 break ;
2433 // This only really encodes to UTF7 (if that) evidently
2434 // case wxFONTENCODING_UTF7 :
2435 // enc = kCFStringEncodingNonLossyASCII ;
2436 // break ;
2437 case wxFONTENCODING_UTF8 :
2438 enc = kCFStringEncodingUTF8 ;
2439 break ;
2440 case wxFONTENCODING_EUC_JP :
2441 enc = kCFStringEncodingEUC_JP;
2442 break ;
2443 case wxFONTENCODING_UTF16 :
2444 enc = kCFStringEncodingUnicode ;
2445 break ;
2446 case wxFONTENCODING_MACROMAN :
2447 enc = kCFStringEncodingMacRoman ;
2448 break ;
2449 case wxFONTENCODING_MACJAPANESE :
2450 enc = kCFStringEncodingMacJapanese ;
2451 break ;
2452 case wxFONTENCODING_MACCHINESETRAD :
2453 enc = kCFStringEncodingMacChineseTrad ;
2454 break ;
2455 case wxFONTENCODING_MACKOREAN :
2456 enc = kCFStringEncodingMacKorean ;
2457 break ;
2458 case wxFONTENCODING_MACARABIC :
2459 enc = kCFStringEncodingMacArabic ;
2460 break ;
2461 case wxFONTENCODING_MACHEBREW :
2462 enc = kCFStringEncodingMacHebrew ;
2463 break ;
2464 case wxFONTENCODING_MACGREEK :
2465 enc = kCFStringEncodingMacGreek ;
2466 break ;
2467 case wxFONTENCODING_MACCYRILLIC :
2468 enc = kCFStringEncodingMacCyrillic ;
2469 break ;
2470 case wxFONTENCODING_MACDEVANAGARI :
2471 enc = kCFStringEncodingMacDevanagari ;
2472 break ;
2473 case wxFONTENCODING_MACGURMUKHI :
2474 enc = kCFStringEncodingMacGurmukhi ;
2475 break ;
2476 case wxFONTENCODING_MACGUJARATI :
2477 enc = kCFStringEncodingMacGujarati ;
2478 break ;
2479 case wxFONTENCODING_MACORIYA :
2480 enc = kCFStringEncodingMacOriya ;
2481 break ;
2482 case wxFONTENCODING_MACBENGALI :
2483 enc = kCFStringEncodingMacBengali ;
2484 break ;
2485 case wxFONTENCODING_MACTAMIL :
2486 enc = kCFStringEncodingMacTamil ;
2487 break ;
2488 case wxFONTENCODING_MACTELUGU :
2489 enc = kCFStringEncodingMacTelugu ;
2490 break ;
2491 case wxFONTENCODING_MACKANNADA :
2492 enc = kCFStringEncodingMacKannada ;
2493 break ;
2494 case wxFONTENCODING_MACMALAJALAM :
2495 enc = kCFStringEncodingMacMalayalam ;
2496 break ;
2497 case wxFONTENCODING_MACSINHALESE :
2498 enc = kCFStringEncodingMacSinhalese ;
2499 break ;
2500 case wxFONTENCODING_MACBURMESE :
2501 enc = kCFStringEncodingMacBurmese ;
2502 break ;
2503 case wxFONTENCODING_MACKHMER :
2504 enc = kCFStringEncodingMacKhmer ;
2505 break ;
2506 case wxFONTENCODING_MACTHAI :
2507 enc = kCFStringEncodingMacThai ;
2508 break ;
2509 case wxFONTENCODING_MACLAOTIAN :
2510 enc = kCFStringEncodingMacLaotian ;
2511 break ;
2512 case wxFONTENCODING_MACGEORGIAN :
2513 enc = kCFStringEncodingMacGeorgian ;
2514 break ;
2515 case wxFONTENCODING_MACARMENIAN :
2516 enc = kCFStringEncodingMacArmenian ;
2517 break ;
2518 case wxFONTENCODING_MACCHINESESIMP :
2519 enc = kCFStringEncodingMacChineseSimp ;
2520 break ;
2521 case wxFONTENCODING_MACTIBETAN :
2522 enc = kCFStringEncodingMacTibetan ;
2523 break ;
2524 case wxFONTENCODING_MACMONGOLIAN :
2525 enc = kCFStringEncodingMacMongolian ;
2526 break ;
2527 case wxFONTENCODING_MACETHIOPIC :
2528 enc = kCFStringEncodingMacEthiopic ;
2529 break ;
2530 case wxFONTENCODING_MACCENTRALEUR :
2531 enc = kCFStringEncodingMacCentralEurRoman ;
2532 break ;
2533 case wxFONTENCODING_MACVIATNAMESE :
2534 enc = kCFStringEncodingMacVietnamese ;
2535 break ;
2536 case wxFONTENCODING_MACARABICEXT :
2537 enc = kCFStringEncodingMacExtArabic ;
2538 break ;
2539 case wxFONTENCODING_MACSYMBOL :
2540 enc = kCFStringEncodingMacSymbol ;
2541 break ;
2542 case wxFONTENCODING_MACDINGBATS :
2543 enc = kCFStringEncodingMacDingbats ;
2544 break ;
2545 case wxFONTENCODING_MACTURKISH :
2546 enc = kCFStringEncodingMacTurkish ;
2547 break ;
2548 case wxFONTENCODING_MACCROATIAN :
2549 enc = kCFStringEncodingMacCroatian ;
2550 break ;
2551 case wxFONTENCODING_MACICELANDIC :
2552 enc = kCFStringEncodingMacIcelandic ;
2553 break ;
2554 case wxFONTENCODING_MACROMANIAN :
2555 enc = kCFStringEncodingMacRomanian ;
2556 break ;
2557 case wxFONTENCODING_MACCELTIC :
2558 enc = kCFStringEncodingMacCeltic ;
2559 break ;
2560 case wxFONTENCODING_MACGAELIC :
2561 enc = kCFStringEncodingMacGaelic ;
2562 break ;
2563 // case wxFONTENCODING_MACKEYBOARD :
2564 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2565 // break ;
2566
2567 default :
2568 // because gcc is picky
2569 break ;
2570 }
2571
2572 return enc ;
2573 }
2574
2575 class wxMBConv_cocoa : public wxMBConv
2576 {
2577 public:
2578 wxMBConv_cocoa()
2579 {
2580 Init(CFStringGetSystemEncoding()) ;
2581 }
2582
2583 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2584 {
2585 m_encoding = conv.m_encoding;
2586 }
2587
2588 #if wxUSE_FONTMAP
2589 wxMBConv_cocoa(const wxChar* name)
2590 {
2591 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2592 }
2593 #endif
2594
2595 wxMBConv_cocoa(wxFontEncoding encoding)
2596 {
2597 Init( wxCFStringEncFromFontEnc(encoding) );
2598 }
2599
2600 virtual ~wxMBConv_cocoa()
2601 {
2602 }
2603
2604 void Init( CFStringEncoding encoding)
2605 {
2606 m_encoding = encoding ;
2607 }
2608
2609 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2610 {
2611 wxASSERT(szUnConv);
2612
2613 CFStringRef theString = CFStringCreateWithBytes (
2614 NULL, //the allocator
2615 (const UInt8*)szUnConv,
2616 strlen(szUnConv),
2617 m_encoding,
2618 false //no BOM/external representation
2619 );
2620
2621 wxASSERT(theString);
2622
2623 size_t nOutLength = CFStringGetLength(theString);
2624
2625 if (szOut == NULL)
2626 {
2627 CFRelease(theString);
2628 return nOutLength;
2629 }
2630
2631 CFRange theRange = { 0, nOutSize };
2632
2633 #if SIZEOF_WCHAR_T == 4
2634 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2635 #endif
2636
2637 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2638
2639 CFRelease(theString);
2640
2641 szUniCharBuffer[nOutLength] = '\0';
2642
2643 #if SIZEOF_WCHAR_T == 4
2644 wxMBConvUTF16 converter;
2645 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2646 delete [] szUniCharBuffer;
2647 #endif
2648
2649 return nOutLength;
2650 }
2651
2652 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2653 {
2654 wxASSERT(szUnConv);
2655
2656 size_t nRealOutSize;
2657 size_t nBufSize = wxWcslen(szUnConv);
2658 UniChar* szUniBuffer = (UniChar*) szUnConv;
2659
2660 #if SIZEOF_WCHAR_T == 4
2661 wxMBConvUTF16 converter ;
2662 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2663 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2664 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2665 nBufSize /= sizeof(UniChar);
2666 #endif
2667
2668 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2669 NULL, //allocator
2670 szUniBuffer,
2671 nBufSize,
2672 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2673 );
2674
2675 wxASSERT(theString);
2676
2677 //Note that CER puts a BOM when converting to unicode
2678 //so we check and use getchars instead in that case
2679 if (m_encoding == kCFStringEncodingUnicode)
2680 {
2681 if (szOut != NULL)
2682 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2683
2684 nRealOutSize = CFStringGetLength(theString) + 1;
2685 }
2686 else
2687 {
2688 CFStringGetBytes(
2689 theString,
2690 CFRangeMake(0, CFStringGetLength(theString)),
2691 m_encoding,
2692 0, //what to put in characters that can't be converted -
2693 //0 tells CFString to return NULL if it meets such a character
2694 false, //not an external representation
2695 (UInt8*) szOut,
2696 nOutSize,
2697 (CFIndex*) &nRealOutSize
2698 );
2699 }
2700
2701 CFRelease(theString);
2702
2703 #if SIZEOF_WCHAR_T == 4
2704 delete[] szUniBuffer;
2705 #endif
2706
2707 return nRealOutSize - 1;
2708 }
2709
2710 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2711
2712 bool IsOk() const
2713 {
2714 return m_encoding != kCFStringEncodingInvalidId &&
2715 CFStringIsEncodingAvailable(m_encoding);
2716 }
2717
2718 private:
2719 CFStringEncoding m_encoding ;
2720 };
2721
2722 #endif // defined(__WXCOCOA__)
2723
2724 // ============================================================================
2725 // Mac conversion classes
2726 // ============================================================================
2727
2728 // DE: Can someone explain to me why this is conditional upon __WXMAC__ instead
2729 // of being used for all Mac OS X systems? This file is part of the base library
2730 // not the core library.
2731 // If we really need GUI-specific conversions then a better method might be to
2732 // provide something in wxAppTraits that could be implemented in the core library.
2733 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2734
2735 class wxMBConv_mac : public wxMBConv
2736 {
2737 public:
2738 wxMBConv_mac()
2739 {
2740 Init(CFStringGetSystemEncoding()) ;
2741 }
2742
2743 wxMBConv_mac(const wxMBConv_mac& conv)
2744 {
2745 Init(conv.m_char_encoding);
2746 }
2747
2748 #if wxUSE_FONTMAP
2749 wxMBConv_mac(const char* name)
2750 {
2751 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2752 }
2753 #endif
2754
2755 wxMBConv_mac(wxFontEncoding encoding)
2756 {
2757 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2758 }
2759
2760 virtual ~wxMBConv_mac()
2761 {
2762 OSStatus status = noErr ;
2763 if (m_MB2WC_converter)
2764 status = TECDisposeConverter(m_MB2WC_converter);
2765 if (m_WC2MB_converter)
2766 status = TECDisposeConverter(m_WC2MB_converter);
2767 }
2768
2769 void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2770 TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2771 {
2772 m_MB2WC_converter = NULL ;
2773 m_WC2MB_converter = NULL ;
2774 m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
2775 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2776 }
2777
2778 virtual void CreateIfNeeded() const
2779 {
2780 if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
2781 {
2782 OSStatus status = noErr ;
2783 status = TECCreateConverter(&m_MB2WC_converter,
2784 m_char_encoding,
2785 m_unicode_encoding);
2786 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2787 status = TECCreateConverter(&m_WC2MB_converter,
2788 m_unicode_encoding,
2789 m_char_encoding);
2790 wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2791 }
2792 }
2793
2794 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2795 {
2796 CreateIfNeeded() ;
2797 OSStatus status = noErr ;
2798 ByteCount byteOutLen ;
2799 ByteCount byteInLen = strlen(psz) + 1;
2800 wchar_t *tbuf = NULL ;
2801 UniChar* ubuf = NULL ;
2802 size_t res = 0 ;
2803
2804 if (buf == NULL)
2805 {
2806 // Apple specs say at least 32
2807 n = wxMax( 32, byteInLen ) ;
2808 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2809 }
2810
2811 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2812
2813 #if SIZEOF_WCHAR_T == 4
2814 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2815 #else
2816 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2817 #endif
2818
2819 status = TECConvertText(
2820 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2821 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2822
2823 #if SIZEOF_WCHAR_T == 4
2824 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2825 // is not properly terminated we get random characters at the end
2826 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2827 wxMBConvUTF16 converter ;
2828 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2829 free( ubuf ) ;
2830 #else
2831 res = byteOutLen / sizeof( UniChar ) ;
2832 #endif
2833
2834 if ( buf == NULL )
2835 free(tbuf) ;
2836
2837 if ( buf && res < n)
2838 buf[res] = 0;
2839
2840 return res ;
2841 }
2842
2843 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2844 {
2845 CreateIfNeeded() ;
2846 OSStatus status = noErr ;
2847 ByteCount byteOutLen ;
2848 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2849
2850 char *tbuf = NULL ;
2851
2852 if (buf == NULL)
2853 {
2854 // Apple specs say at least 32
2855 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2856 tbuf = (char*) malloc( n ) ;
2857 }
2858
2859 ByteCount byteBufferLen = n ;
2860 UniChar* ubuf = NULL ;
2861
2862 #if SIZEOF_WCHAR_T == 4
2863 wxMBConvUTF16 converter ;
2864 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2865 byteInLen = unicharlen ;
2866 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2867 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2868 #else
2869 ubuf = (UniChar*) psz ;
2870 #endif
2871
2872 status = TECConvertText(
2873 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2874 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2875
2876 #if SIZEOF_WCHAR_T == 4
2877 free( ubuf ) ;
2878 #endif
2879
2880 if ( buf == NULL )
2881 free(tbuf) ;
2882
2883 size_t res = byteOutLen ;
2884 if ( buf && res < n)
2885 {
2886 buf[res] = 0;
2887
2888 //we need to double-trip to verify it didn't insert any ? in place
2889 //of bogus characters
2890 wxWCharBuffer wcBuf(n);
2891 size_t pszlen = wxWcslen(psz);
2892 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2893 wxWcslen(wcBuf) != pszlen ||
2894 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2895 {
2896 // we didn't obtain the same thing we started from, hence
2897 // the conversion was lossy and we consider that it failed
2898 return wxCONV_FAILED;
2899 }
2900 }
2901
2902 return res ;
2903 }
2904
2905 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2906
2907 bool IsOk() const
2908 {
2909 CreateIfNeeded() ;
2910 return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
2911 }
2912
2913 protected :
2914 mutable TECObjectRef m_MB2WC_converter;
2915 mutable TECObjectRef m_WC2MB_converter;
2916
2917 TextEncodingBase m_char_encoding;
2918 TextEncodingBase m_unicode_encoding;
2919 };
2920
2921 // MB is decomposed (D) normalized UTF8
2922
2923 class wxMBConv_macUTF8D : public wxMBConv_mac
2924 {
2925 public :
2926 wxMBConv_macUTF8D()
2927 {
2928 Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
2929 m_uni = NULL;
2930 m_uniBack = NULL ;
2931 }
2932
2933 virtual ~wxMBConv_macUTF8D()
2934 {
2935 if (m_uni!=NULL)
2936 DisposeUnicodeToTextInfo(&m_uni);
2937 if (m_uniBack!=NULL)
2938 DisposeUnicodeToTextInfo(&m_uniBack);
2939 }
2940
2941 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2942 {
2943 CreateIfNeeded() ;
2944 OSStatus status = noErr ;
2945 ByteCount byteOutLen ;
2946 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2947
2948 char *tbuf = NULL ;
2949
2950 if (buf == NULL)
2951 {
2952 // Apple specs say at least 32
2953 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2954 tbuf = (char*) malloc( n ) ;
2955 }
2956
2957 ByteCount byteBufferLen = n ;
2958 UniChar* ubuf = NULL ;
2959
2960 #if SIZEOF_WCHAR_T == 4
2961 wxMBConvUTF16 converter ;
2962 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2963 byteInLen = unicharlen ;
2964 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2965 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2966 #else
2967 ubuf = (UniChar*) psz ;
2968 #endif
2969
2970 // ubuf is a non-decomposed UniChar buffer
2971
2972 ByteCount dcubuflen = byteInLen * 2 + 2 ;
2973 ByteCount dcubufread , dcubufwritten ;
2974 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2975
2976 ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
2977 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , dcubuf ) ;
2978
2979 // we now convert that decomposed buffer into UTF8
2980
2981 status = TECConvertText(
2982 m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
2983 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2984
2985 free( dcubuf );
2986
2987 #if SIZEOF_WCHAR_T == 4
2988 free( ubuf ) ;
2989 #endif
2990
2991 if ( buf == NULL )
2992 free(tbuf) ;
2993
2994 size_t res = byteOutLen ;
2995 if ( buf && res < n)
2996 {
2997 buf[res] = 0;
2998 // don't test for round-trip fidelity yet, we cannot guarantee it yet
2999 }
3000
3001 return res ;
3002 }
3003
3004 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
3005 {
3006 CreateIfNeeded() ;
3007 OSStatus status = noErr ;
3008 ByteCount byteOutLen ;
3009 ByteCount byteInLen = strlen(psz) + 1;
3010 wchar_t *tbuf = NULL ;
3011 UniChar* ubuf = NULL ;
3012 size_t res = 0 ;
3013
3014 if (buf == NULL)
3015 {
3016 // Apple specs say at least 32
3017 n = wxMax( 32, byteInLen ) ;
3018 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
3019 }
3020
3021 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
3022
3023 #if SIZEOF_WCHAR_T == 4
3024 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
3025 #else
3026 ubuf = (UniChar*) (buf ? buf : tbuf) ;
3027 #endif
3028
3029 ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
3030 ByteCount dcubufread , dcubufwritten ;
3031 UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
3032
3033 status = TECConvertText(
3034 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
3035 (TextPtr) dcubuf, dcubuflen, &byteOutLen);
3036 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
3037 // is not properly terminated we get random characters at the end
3038 dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3039
3040 // now from the decomposed UniChar to properly composed uniChar
3041 ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
3042 kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , ubuf ) ;
3043
3044 free( dcubuf );
3045 byteOutLen = dcubufwritten ;
3046 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3047
3048
3049 #if SIZEOF_WCHAR_T == 4
3050 wxMBConvUTF16 converter ;
3051 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
3052 free( ubuf ) ;
3053 #else
3054 res = byteOutLen / sizeof( UniChar ) ;
3055 #endif
3056
3057 if ( buf == NULL )
3058 free(tbuf) ;
3059
3060 if ( buf && res < n)
3061 buf[res] = 0;
3062
3063 return res ;
3064 }
3065
3066 virtual void CreateIfNeeded() const
3067 {
3068 wxMBConv_mac::CreateIfNeeded() ;
3069 if ( m_uni == NULL )
3070 {
3071 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3072 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3073 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3074 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
3075 m_map.mappingVersion = kUnicodeUseLatestMapping;
3076
3077 OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
3078 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3079
3080 m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3081 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3082 m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3083 kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
3084 m_map.mappingVersion = kUnicodeUseLatestMapping;
3085 err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
3086 wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3087 }
3088 }
3089 protected :
3090 mutable UnicodeToTextInfo m_uni;
3091 mutable UnicodeToTextInfo m_uniBack;
3092 mutable UnicodeMapping m_map;
3093 };
3094 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3095
3096 // ============================================================================
3097 // wxEncodingConverter based conversion classes
3098 // ============================================================================
3099
3100 #if wxUSE_FONTMAP
3101
3102 class wxMBConv_wxwin : public wxMBConv
3103 {
3104 private:
3105 void Init()
3106 {
3107 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
3108 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
3109 }
3110
3111 public:
3112 // temporarily just use wxEncodingConverter stuff,
3113 // so that it works while a better implementation is built
3114 wxMBConv_wxwin(const char* name)
3115 {
3116 if (name)
3117 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3118 else
3119 m_enc = wxFONTENCODING_SYSTEM;
3120
3121 Init();
3122 }
3123
3124 wxMBConv_wxwin(wxFontEncoding enc)
3125 {
3126 m_enc = enc;
3127
3128 Init();
3129 }
3130
3131 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
3132 {
3133 size_t inbuf = strlen(psz);
3134 if (buf)
3135 {
3136 if (!m2w.Convert(psz, buf))
3137 return wxCONV_FAILED;
3138 }
3139 return inbuf;
3140 }
3141
3142 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
3143 {
3144 const size_t inbuf = wxWcslen(psz);
3145 if (buf)
3146 {
3147 if (!w2m.Convert(psz, buf))
3148 return wxCONV_FAILED;
3149 }
3150
3151 return inbuf;
3152 }
3153
3154 virtual size_t GetMBNulLen() const
3155 {
3156 switch ( m_enc )
3157 {
3158 case wxFONTENCODING_UTF16BE:
3159 case wxFONTENCODING_UTF16LE:
3160 return 2;
3161
3162 case wxFONTENCODING_UTF32BE:
3163 case wxFONTENCODING_UTF32LE:
3164 return 4;
3165
3166 default:
3167 return 1;
3168 }
3169 }
3170
3171 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
3172
3173 bool IsOk() const { return m_ok; }
3174
3175 public:
3176 wxFontEncoding m_enc;
3177 wxEncodingConverter m2w, w2m;
3178
3179 private:
3180 // were we initialized successfully?
3181 bool m_ok;
3182
3183 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
3184 };
3185
3186 // make the constructors available for unit testing
3187 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
3188 {
3189 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
3190 if ( !result->IsOk() )
3191 {
3192 delete result;
3193 return 0;
3194 }
3195
3196 return result;
3197 }
3198
3199 #endif // wxUSE_FONTMAP
3200
3201 // ============================================================================
3202 // wxCSConv implementation
3203 // ============================================================================
3204
3205 void wxCSConv::Init()
3206 {
3207 m_name = NULL;
3208 m_convReal = NULL;
3209 m_deferred = true;
3210 }
3211
3212 wxCSConv::wxCSConv(const wxString& charset)
3213 {
3214 Init();
3215
3216 if ( !charset.empty() )
3217 {
3218 SetName(charset.ToAscii());
3219 }
3220
3221 #if wxUSE_FONTMAP
3222 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3223 #else
3224 m_encoding = wxFONTENCODING_SYSTEM;
3225 #endif
3226 }
3227
3228 wxCSConv::wxCSConv(wxFontEncoding encoding)
3229 {
3230 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3231 {
3232 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3233
3234 encoding = wxFONTENCODING_SYSTEM;
3235 }
3236
3237 Init();
3238
3239 m_encoding = encoding;
3240 }
3241
3242 wxCSConv::~wxCSConv()
3243 {
3244 Clear();
3245 }
3246
3247 wxCSConv::wxCSConv(const wxCSConv& conv)
3248 : wxMBConv()
3249 {
3250 Init();
3251
3252 SetName(conv.m_name);
3253 m_encoding = conv.m_encoding;
3254 }
3255
3256 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3257 {
3258 Clear();
3259
3260 SetName(conv.m_name);
3261 m_encoding = conv.m_encoding;
3262
3263 return *this;
3264 }
3265
3266 void wxCSConv::Clear()
3267 {
3268 free(m_name);
3269 delete m_convReal;
3270
3271 m_name = NULL;
3272 m_convReal = NULL;
3273 }
3274
3275 void wxCSConv::SetName(const char *charset)
3276 {
3277 if (charset)
3278 {
3279 m_name = strdup(charset);
3280 m_deferred = true;
3281 }
3282 }
3283
3284 #if wxUSE_FONTMAP
3285
// Hash map type associating a charset name with a font encoding.
3286 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3287 wxEncodingNameCache );
3288
// Cache used by wxCSConv::DoCreate(): for an encoding it holds the name with
// which an iconv converter could be created or an empty string if this
// failed for all names of the encoding.
3289 static wxEncodingNameCache gs_nameCache;
3290 #endif
3291
// Create the object doing the real conversion for m_name or m_encoding.
//
// Returns NULL for wxFONTENCODING_ISO8859_1 and wxFONTENCODING_DEFAULT, for
// which no conversion object is used, and also if no converter could be
// created, in which case an error is logged. Otherwise the first usable
// converter among those tried in the order described below is returned; the
// only caller in this file, CreateConvIfNeeded(), stores it in m_convReal.
3292 wxMBConv *wxCSConv::DoCreate() const
3293 {
3294 #if wxUSE_FONTMAP
3295 wxLogTrace(TRACE_STRCONV,
3296 wxT("creating conversion for %s"),
3297 (m_name ? m_name
3298 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3299 #endif // wxUSE_FONTMAP
3300
3301 // check for the special case of ASCII or ISO8859-1 charset: as we have
3302 // special knowledge of it anyhow, we don't need to create a special
3303 // conversion object
3304 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3305 m_encoding == wxFONTENCODING_DEFAULT )
3306 {
3307 // don't convert at all
3308 return NULL;
3309 }
3310
3311 // we trust OS to do conversion better than we can so try external
3312 // conversion methods first
3313 //
3314 // the full order is:
3315 // 1. OS conversion (iconv() under Unix or Win32 API)
3316 // 2. hard coded conversions for UTF
3317 // 3. wxEncodingConverter as fall back
3318
3319 // step (1)
3320 #ifdef HAVE_ICONV
3321 #if !wxUSE_FONTMAP
3322 if ( m_name )
3323 #endif // !wxUSE_FONTMAP
3324 {
3325 #if wxUSE_FONTMAP
3326 wxFontEncoding encoding(m_encoding);
3327 #endif
3328
// first try the name we were given as is
3329 if ( m_name )
3330 {
3331 wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3332 if ( conv->IsOk() )
3333 return conv;
3334
3335 delete conv;
3336
3337 #if wxUSE_FONTMAP
3338 encoding =
3339 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3340 #endif // wxUSE_FONTMAP
3341 }
3342 #if wxUSE_FONTMAP
3343 {
// then the name which worked for this encoding before, if any
//
// NOTE(review): a cached failure returns NULL at once, without trying
// steps (2) and (3) below -- confirm that this is intended
3344 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3345 if ( it != gs_nameCache.end() )
3346 {
3347 if ( it->second.empty() )
3348 return NULL;
3349
3350 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3351 if ( conv->IsOk() )
3352 return conv;
3353
3354 delete conv;
3355 }
3356
// finally all the names of the encoding, remembering the result
3357 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3358 // CS : in case this does not return valid names (eg for MacRoman)
3359 // encoding got a 'failure' entry in the cache all the same,
3360 // although it just has to be created using a different method, so
3361 // only store failed iconv creation attempts (or perhaps we
3362 // shouldn't do this at all ?)
3363 if ( names[0] != NULL )
3364 {
3365 for ( ; *names; ++names )
3366 {
3367 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3368 // will need changes that will obsolete this
3369 wxString name(*names);
3370 wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3371 if ( conv->IsOk() )
3372 {
3373 gs_nameCache[encoding] = *names;
3374 return conv;
3375 }
3376
3377 delete conv;
3378 }
3379
3380 gs_nameCache[encoding] = _T(""); // cache the failure
3381 }
3382 }
3383 #endif // wxUSE_FONTMAP
3384 }
3385 #endif // HAVE_ICONV
3386
3387 #ifdef wxHAVE_WIN32_MB2WC
3388 {
3389 #if wxUSE_FONTMAP
3390 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3391 : new wxMBConv_win32(m_encoding);
3392 if ( conv->IsOk() )
3393 return conv;
3394
3395 delete conv;
3396 #else
3397 return NULL;
3398 #endif
3399 }
3400 #endif // wxHAVE_WIN32_MB2WC
3401
3402 #if defined(__WXMAC__)
3403 {
3404 // leave UTF16 and UTF32 to the built-ins of wx
3405 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3406 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3407 {
3408 #if wxUSE_FONTMAP
3409 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3410 : new wxMBConv_mac(m_encoding);
3411 #else
3412 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3413 #endif
3414 if ( conv->IsOk() )
3415 return conv;
3416
3417 delete conv;
3418 }
3419 }
3420 #endif
3421
// the wxMBConv_cocoa converter is currently disabled
3422 #if 0 //defined(__WXCOCOA__)
3423 {
3424 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3425 {
3426 #if wxUSE_FONTMAP
3427 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3428 : new wxMBConv_cocoa(m_encoding);
3429 #else
3430 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3431 #endif
3432
3433 if ( conv->IsOk() )
3434 return conv;
3435
3436 delete conv;
3437 }
3438 }
3439 #endif
3440 // step (2)
3441 wxFontEncoding enc = m_encoding;
3442 #if wxUSE_FONTMAP
3443 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3444 {
3445 // use "false" to suppress interactive dialogs -- we can be called from
3446 // anywhere and popping up a dialog from here is the last thing we want to
3447 // do
3448 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3449 }
3450 #endif // wxUSE_FONTMAP
3451
3452 switch ( enc )
3453 {
3454 case wxFONTENCODING_UTF7:
3455 return new wxMBConvUTF7;
3456
3457 case wxFONTENCODING_UTF8:
3458 return new wxMBConvUTF8;
3459
3460 case wxFONTENCODING_UTF16BE:
3461 return new wxMBConvUTF16BE;
3462
3463 case wxFONTENCODING_UTF16LE:
3464 return new wxMBConvUTF16LE;
3465
3466 case wxFONTENCODING_UTF32BE:
3467 return new wxMBConvUTF32BE;
3468
3469 case wxFONTENCODING_UTF32LE:
3470 return new wxMBConvUTF32LE;
3471
3472 default:
3473 // nothing to do but put here to suppress gcc warnings
3474 break;
3475 }
3476
3477 // step (3)
3478 #if wxUSE_FONTMAP
3479 {
3480 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3481 : new wxMBConv_wxwin(m_encoding);
3482 if ( conv->IsOk() )
3483 return conv;
3484
3485 delete conv;
3486 }
3487 #endif // wxUSE_FONTMAP
3488
3489 // NB: This is a hack to prevent deadlock. What could otherwise happen
3490 // in Unicode build: wxConvLocal creation ends up being here
3491 // because of some failure and logs the error. But wxLog will try to
3492 // attach a timestamp, for which it will need wxConvLocal (to convert
3493 // time to char* and then wchar_t*), but that fails, tries to log the
3494 // error, but wxLog has an (already locked) critical section that
3495 // guards the static buffer.
3496 static bool alreadyLoggingError = false;
3497 if (!alreadyLoggingError)
3498 {
3499 alreadyLoggingError = true;
3500 wxLogError(_("Cannot convert from the charset '%s'!"),
3501 m_name ? m_name
3502 :
3503 #if wxUSE_FONTMAP
3504 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3505 #else // !wxUSE_FONTMAP
3506 (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3507 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3508 );
3509
3510 alreadyLoggingError = false;
3511 }
3512
3513 return NULL;
3514 }
3515
3516 void wxCSConv::CreateConvIfNeeded() const
3517 {
3518 if ( m_deferred )
3519 {
3520 wxCSConv *self = (wxCSConv *)this; // const_cast
3521
3522 // if we don't have neither the name nor the encoding, use the default
3523 // encoding for this system
3524 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3525 {
3526 #if wxUSE_INTL
3527 self->m_encoding = wxLocale::GetSystemEncoding();
3528 #else
3529 // fallback to some reasonable default:
3530 self->m_encoding = wxFONTENCODING_ISO8859_1;
3531 #endif // wxUSE_INTL
3532 }
3533
3534 self->m_convReal = DoCreate();
3535 self->m_deferred = false;
3536 }
3537 }
3538
3539 bool wxCSConv::IsOk() const
3540 {
3541 CreateConvIfNeeded();
3542
3543 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3544 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3545 return true; // always ok as we do it ourselves
3546
3547 // m_convReal->IsOk() is called at its own creation, so we know it must
3548 // be ok if m_convReal is non-NULL
3549 return m_convReal != NULL;
3550 }
3551
3552 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3553 const char *src, size_t srcLen) const
3554 {
3555 CreateConvIfNeeded();
3556
3557 if (m_convReal)
3558 return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3559
3560 // latin-1 (direct)
3561 return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3562 }
3563
3564 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3565 const wchar_t *src, size_t srcLen) const
3566 {
3567 CreateConvIfNeeded();
3568
3569 if (m_convReal)
3570 return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3571
3572 // latin-1 (direct)
3573 return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3574 }
3575
3576 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3577 {
3578 CreateConvIfNeeded();
3579
3580 if (m_convReal)
3581 return m_convReal->MB2WC(buf, psz, n);
3582
3583 // latin-1 (direct)
3584 size_t len = strlen(psz);
3585
3586 if (buf)
3587 {
3588 for (size_t c = 0; c <= len; c++)
3589 buf[c] = (unsigned char)(psz[c]);
3590 }
3591
3592 return len;
3593 }
3594
3595 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3596 {
3597 CreateConvIfNeeded();
3598
3599 if (m_convReal)
3600 return m_convReal->WC2MB(buf, psz, n);
3601
3602 // latin-1 (direct)
3603 const size_t len = wxWcslen(psz);
3604 if (buf)
3605 {
3606 for (size_t c = 0; c <= len; c++)
3607 {
3608 if (psz[c] > 0xFF)
3609 return wxCONV_FAILED;
3610
3611 buf[c] = (char)psz[c];
3612 }
3613 }
3614 else
3615 {
3616 for (size_t c = 0; c <= len; c++)
3617 {
3618 if (psz[c] > 0xFF)
3619 return wxCONV_FAILED;
3620 }
3621 }
3622
3623 return len;
3624 }
3625
3626 size_t wxCSConv::GetMBNulLen() const
3627 {
3628 CreateConvIfNeeded();
3629
3630 if ( m_convReal )
3631 {
3632 return m_convReal->GetMBNulLen();
3633 }
3634
3635 // otherwise, we are ISO-8859-1
3636 return 1;
3637 }
3638
3639 #if wxUSE_UNICODE_UTF8
3640 bool wxCSConv::IsUTF8() const
3641 {
3642 CreateConvIfNeeded();
3643
3644 if ( m_convReal )
3645 {
3646 return m_convReal->IsUTF8();
3647 }
3648
3649 // otherwise, we are ISO-8859-1
3650 return false;
3651 }
3652 #endif
3653
3654
3655 #if wxUSE_UNICODE
3656
3657 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3658 {
3659 if ( !s )
3660 return wxWCharBuffer();
3661
3662 wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3663 if ( !wbuf )
3664 wbuf = wxMBConvUTF8().cMB2WX(s);
3665 if ( !wbuf )
3666 wbuf = wxConvISO8859_1.cMB2WX(s);
3667
3668 return wbuf;
3669 }
3670
3671 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3672 {
3673 if ( !ws )
3674 return wxCharBuffer();
3675
3676 wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3677 if ( !buf )
3678 buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3679
3680 return buf;
3681 }
3682
3683 #endif // wxUSE_UNICODE
3684
3685 // ----------------------------------------------------------------------------
3686 // globals
3687 // ----------------------------------------------------------------------------
3688
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3695
// the converter names are used as plain identifiers (and pasted into other
// identifiers) below, so make sure none of them is a macro any more
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Defines, for the converter called conv:
//  - the global pointer convPtr of type baseClass*, initialized to NULL
//  - the accessor wxGet_convPtr() returning the address of a function-local
//    static object of type implClass constructed with ctorArgs
//  - a file-static pointer initialized by calling the accessor: this ensures
//    that all global converter objects are created by the time static
//    initialization is done, i.e. before any thread is launched
//
// The macro expansion doesn't end with a semicolon, it must be provided where
// the macro is used.
#define WX_DEFINE_GLOBAL_CONV2(baseClass, implClass, conv, ctorArgs)        \
    WXDLLIMPEXP_DATA_BASE(baseClass*) conv##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE baseClass* wxGet_##conv##Ptr()                         \
    {                                                                       \
        static implClass conv##Obj ctorArgs;                                \
        return &conv##Obj;                                                  \
    }                                                                       \
    static baseClass* gs_##conv##instance = wxGet_##conv##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() for the common case when the object is
// of the same class as the pointer used to access it.
#define WX_DEFINE_GLOBAL_CONV(convClass, conv, ctorArgs) \
    WX_DEFINE_GLOBAL_CONV2(convClass, convClass, conv, ctorArgs)
3716
3717 #ifdef __WINDOWS__
3718 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3719 #elif defined(__WXMAC__) && !defined(__MACH__)
3720 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_mac, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3721 #else
3722 WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3723 #endif
3724
3725 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3726 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3727
3728 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3729 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3730
3731 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3732 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3733
3734 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3735 static wxMBConv_macUTF8D wxConvMacUTF8DObj;
3736 #endif
3737 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3738 #ifdef __WXOSX__
3739 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3740 &wxConvMacUTF8DObj;
3741 #else
3742 wxGet_wxConvUTF8Ptr();
3743 #endif
3744 #else // !__WXOSX__
3745 wxGet_wxConvLibcPtr();
3746 #endif // __WXOSX__/!__WXOSX__
3747
3748 #else // !wxUSE_WCHAR_T
3749
3750 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3751 // stand-ins in absence of wchar_t
3752 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3753 wxConvISO8859_1,
3754 wxConvLocal,
3755 wxConvUTF8;
3756
3757 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T