don't allocate 0-sized buffer in cWC2MB() even if input size is 0
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifndef WX_PRECOMP
19 #include "wx/intl.h"
20 #include "wx/log.h"
21 #include "wx/utils.h"
22 #endif
23
24 #include "wx/strconv.h"
25
26 #if wxUSE_WCHAR_T
27
28 #ifdef __WINDOWS__
29 #include "wx/msw/private.h"
30 #include "wx/msw/missing.h"
31 #endif
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #define wxHAVE_WIN32_MB2WC
43 #endif
44
45 #ifdef __SALFORDC__
46 #include <clib.h>
47 #endif
48
49 #ifdef HAVE_ICONV
50 #include <iconv.h>
51 #include "wx/thread.h"
52 #endif
53
54 #include "wx/encconv.h"
55 #include "wx/fontmap.h"
56
57 #ifdef __WXMAC__
58 #ifndef __DARWIN__
59 #include <ATSUnicode.h>
60 #include <TextCommon.h>
61 #include <TextEncodingConverter.h>
62 #endif
63
64 // includes Mac headers
65 #include "wx/mac/private.h"
66 #endif
67
68
69 #define TRACE_STRCONV _T("strconv")
70
71 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
72 // be 4 bytes
73 #if SIZEOF_WCHAR_T == 2
74 #define WC_UTF16
75 #endif
76
77
78 // ============================================================================
79 // implementation
80 // ============================================================================
81
// Helper used when looking for string terminators: returns true if at least
// one of the n bytes starting at p is not NUL and false if all of them are
// NUL (which is also the result for n == 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
90
91 // ----------------------------------------------------------------------------
92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
93 // ----------------------------------------------------------------------------
94
95 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
96 {
97 if (input <= 0xffff)
98 {
99 if (output)
100 *output = (wxUint16) input;
101
102 return 1;
103 }
104 else if (input >= 0x110000)
105 {
106 return wxCONV_FAILED;
107 }
108 else
109 {
110 if (output)
111 {
112 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
113 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
114 }
115
116 return 2;
117 }
118 }
119
120 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
121 {
122 if ((*input < 0xd800) || (*input > 0xdfff))
123 {
124 output = *input;
125 return 1;
126 }
127 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
128 {
129 output = *input;
130 return wxCONV_FAILED;
131 }
132 else
133 {
134 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
135 return 2;
136 }
137 }
138
139 #ifdef WC_UTF16
140 typedef wchar_t wxDecodeSurrogate_t;
141 #else // !WC_UTF16
142 typedef wxUint16 wxDecodeSurrogate_t;
143 #endif // WC_UTF16/!WC_UTF16
144
145 // returns the next UTF-32 character from the wchar_t buffer and advances the
146 // pointer to the character after this one
147 //
148 // if an invalid character is found, *pSrc is set to NULL, the caller must
149 // check for this
150 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
151 {
152 wxUint32 out;
153 const size_t
154 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
155 if ( n == wxCONV_FAILED )
156 *pSrc = NULL;
157 else
158 *pSrc += n;
159
160 return out;
161 }
162
163 // ----------------------------------------------------------------------------
164 // wxMBConv
165 // ----------------------------------------------------------------------------
166
167 size_t
168 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
169 const char *src, size_t srcLen) const
170 {
171 // although new conversion classes are supposed to implement this function
172 // directly, the existins ones only implement the old MB2WC() and so, to
173 // avoid to have to rewrite all conversion classes at once, we provide a
174 // default (but not efficient) implementation of this one in terms of the
175 // old function by copying the input to ensure that it's NUL-terminated and
176 // then using MB2WC() to convert it
177
178 // the number of chars [which would be] written to dst [if it were not NULL]
179 size_t dstWritten = 0;
180
181 // the number of NULs terminating this string
182 size_t nulLen = 0; // not really needed, but just to avoid warnings
183
184 // if we were not given the input size we just have to assume that the
185 // string is properly terminated as we have no way of knowing how long it
186 // is anyhow, but if we do have the size check whether there are enough
187 // NULs at the end
188 wxCharBuffer bufTmp;
189 const char *srcEnd;
190 if ( srcLen != wxNO_LEN )
191 {
192 // we need to know how to find the end of this string
193 nulLen = GetMBNulLen();
194 if ( nulLen == wxCONV_FAILED )
195 return wxCONV_FAILED;
196
197 // if there are enough NULs we can avoid the copy
198 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
199 {
200 // make a copy in order to properly NUL-terminate the string
201 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
202 char * const p = bufTmp.data();
203 memcpy(p, src, srcLen);
204 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
205 *s = '\0';
206
207 src = bufTmp;
208 }
209
210 srcEnd = src + srcLen;
211 }
212 else // quit after the first loop iteration
213 {
214 srcEnd = NULL;
215 }
216
217 for ( ;; )
218 {
219 // try to convert the current chunk
220 size_t lenChunk = MB2WC(NULL, src, 0);
221 if ( lenChunk == wxCONV_FAILED )
222 return wxCONV_FAILED;
223
224 lenChunk++; // for the L'\0' at the end of this chunk
225
226 dstWritten += lenChunk;
227
228 if ( lenChunk == 1 )
229 {
230 // nothing left in the input string, conversion succeeded
231 break;
232 }
233
234 if ( dst )
235 {
236 if ( dstWritten > dstLen )
237 return wxCONV_FAILED;
238
239 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
240 return wxCONV_FAILED;
241
242 dst += lenChunk;
243 }
244
245 if ( !srcEnd )
246 {
247 // we convert just one chunk in this case as this is the entire
248 // string anyhow
249 break;
250 }
251
252 // advance the input pointer past the end of this chunk
253 while ( NotAllNULs(src, nulLen) )
254 {
255 // notice that we must skip over multiple bytes here as we suppose
256 // that if NUL takes 2 or 4 bytes, then all the other characters do
257 // too and so if advanced by a single byte we might erroneously
258 // detect sequences of NUL bytes in the middle of the input
259 src += nulLen;
260 }
261
262 src += nulLen; // skipping over its terminator as well
263
264 // note that ">=" (and not just "==") is needed here as the terminator
265 // we skipped just above could be inside or just after the buffer
266 // delimited by inEnd
267 if ( src >= srcEnd )
268 break;
269 }
270
271 return dstWritten;
272 }
273
274 size_t
275 wxMBConv::FromWChar(char *dst, size_t dstLen,
276 const wchar_t *src, size_t srcLen) const
277 {
278 // the number of chars [which would be] written to dst [if it were not NULL]
279 size_t dstWritten = 0;
280
281 // make a copy of the input string unless it is already properly
282 // NUL-terminated
283 //
284 // if we don't know its length we have no choice but to assume that it is,
285 // indeed, properly terminated
286 wxWCharBuffer bufTmp;
287 if ( srcLen == wxNO_LEN )
288 {
289 srcLen = wxWcslen(src) + 1;
290 }
291 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
292 {
293 // make a copy in order to properly NUL-terminate the string
294 bufTmp = wxWCharBuffer(srcLen);
295 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
296 src = bufTmp;
297 }
298
299 const size_t lenNul = GetMBNulLen();
300 for ( const wchar_t * const srcEnd = src + srcLen;
301 src < srcEnd;
302 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
303 {
304 // try to convert the current chunk
305 size_t lenChunk = WC2MB(NULL, src, 0);
306
307 if ( lenChunk == wxCONV_FAILED )
308 return wxCONV_FAILED;
309
310 lenChunk += lenNul;
311 dstWritten += lenChunk;
312
313 if ( dst )
314 {
315 if ( dstWritten > dstLen )
316 return wxCONV_FAILED;
317
318 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
319 return wxCONV_FAILED;
320
321 dst += lenChunk;
322 }
323 }
324
325 return dstWritten;
326 }
327
328 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
329 {
330 size_t rc = ToWChar(outBuff, outLen, inBuff);
331 if ( rc != wxCONV_FAILED )
332 {
333 // ToWChar() returns the buffer length, i.e. including the trailing
334 // NUL, while this method doesn't take it into account
335 rc--;
336 }
337
338 return rc;
339 }
340
341 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
342 {
343 size_t rc = FromWChar(outBuff, outLen, inBuff);
344 if ( rc != wxCONV_FAILED )
345 {
346 rc -= GetMBNulLen();
347 }
348
349 return rc;
350 }
351
352 wxMBConv::~wxMBConv()
353 {
354 // nothing to do here (necessary for Darwin linking probably)
355 }
356
357 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
358 {
359 if ( psz )
360 {
361 // calculate the length of the buffer needed first
362 const size_t nLen = MB2WC(NULL, psz, 0);
363 if ( nLen != wxCONV_FAILED )
364 {
365 // now do the actual conversion
366 wxWCharBuffer buf(nLen /* +1 added implicitly */);
367
368 // +1 for the trailing NULL
369 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
370 return buf;
371 }
372 }
373
374 return wxWCharBuffer();
375 }
376
377 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
378 {
379 if ( pwz )
380 {
381 const size_t nLen = WC2MB(NULL, pwz, 0);
382 if ( nLen != wxCONV_FAILED )
383 {
384 // extra space for trailing NUL(s)
385 static const size_t extraLen = GetMaxMBNulLen();
386
387 wxCharBuffer buf(nLen + extraLen - 1);
388 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
389 return buf;
390 }
391 }
392
393 return wxCharBuffer();
394 }
395
396 const wxWCharBuffer
397 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
398 {
399 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
400 if ( dstLen != wxCONV_FAILED )
401 {
402 wxWCharBuffer wbuf(dstLen - 1);
403 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
404 {
405 if ( outLen )
406 {
407 *outLen = dstLen;
408 if ( wbuf[dstLen - 1] == L'\0' )
409 (*outLen)--;
410 }
411
412 return wbuf;
413 }
414 }
415
416 if ( outLen )
417 *outLen = 0;
418
419 return wxWCharBuffer();
420 }
421
422 const wxCharBuffer
423 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
424 {
425 size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
426 if ( dstLen != wxCONV_FAILED )
427 {
428 if ( !dstLen )
429 {
430 // special case: can't allocate 0 size buffer below
431 dstLen++;
432 }
433
434 wxCharBuffer buf(dstLen - 1);
435 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
436 {
437 if ( outLen )
438 {
439 *outLen = dstLen;
440
441 const size_t nulLen = GetMBNulLen();
442 if ( dstLen >= nulLen &&
443 !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
444 {
445 // in this case the output is NUL-terminated and we're not
446 // supposed to count NUL
447 *outLen -= nulLen;
448 }
449 }
450
451 return buf;
452 }
453 }
454
455 if ( outLen )
456 *outLen = 0;
457
458 return wxCharBuffer();
459 }
460
461 // ----------------------------------------------------------------------------
462 // wxMBConvLibc
463 // ----------------------------------------------------------------------------
464
465 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
466 {
467 return wxMB2WC(buf, psz, n);
468 }
469
470 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
471 {
472 return wxWC2MB(buf, psz, n);
473 }
474
475 // ----------------------------------------------------------------------------
476 // wxConvBrokenFileNames
477 // ----------------------------------------------------------------------------
478
479 #ifdef __UNIX__
480
481 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
482 {
483 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
484 || wxStricmp(charset, _T("UTF8")) == 0 )
485 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
486 else
487 m_conv = new wxCSConv(charset);
488 }
489
490 #endif // __UNIX__
491
492 // ----------------------------------------------------------------------------
493 // UTF-7
494 // ----------------------------------------------------------------------------
495
496 // Implementation (C) 2004 Fredrik Roubert
497
//
// BASE64 decoding table: maps a byte to the 6 bit value it represents or to
// 0xff if the byte is not one of the 64 characters of the BASE64 alphabet
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0'..'9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A'..'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P'..'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a'..'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p'..'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: none of these bytes is valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
536
537 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
538 {
539 size_t len = 0;
540
541 while ( *psz && (!buf || (len < n)) )
542 {
543 unsigned char cc = *psz++;
544 if (cc != '+')
545 {
546 // plain ASCII char
547 if (buf)
548 *buf++ = cc;
549 len++;
550 }
551 else if (*psz == '-')
552 {
553 // encoded plus sign
554 if (buf)
555 *buf++ = cc;
556 len++;
557 psz++;
558 }
559 else // start of BASE64 encoded string
560 {
561 bool lsb, ok;
562 unsigned int d, l;
563 for ( ok = lsb = false, d = 0, l = 0;
564 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
565 psz++ )
566 {
567 d <<= 6;
568 d += cc;
569 for (l += 6; l >= 8; lsb = !lsb)
570 {
571 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
572 if (lsb)
573 {
574 if (buf)
575 *buf++ |= c;
576 len ++;
577 }
578 else
579 {
580 if (buf)
581 *buf = (wchar_t)(c << 8);
582 }
583
584 ok = true;
585 }
586 }
587
588 if ( !ok )
589 {
590 // in valid UTF7 we should have valid characters after '+'
591 return wxCONV_FAILED;
592 }
593
594 if (*psz == '-')
595 psz++;
596 }
597 }
598
599 if ( buf && (len < n) )
600 *buf = '\0';
601
602 return len;
603 }
604
//
// BASE64 encoding table: maps a 6 bit value to the character representing it
//
static const unsigned char utf7enb64[] =
{
    // values 0 - 15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // values 16 - 31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // values 32 - 47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // values 48 - 63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
619
//
// UTF-7 encoding table, indexed by the ASCII character code
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x00 - 0x07
    3, 2, 2, 3, 3, 2, 3, 3,     // 0x08 - 0x0f
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x10 - 0x17
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x18 - 0x1f
    2, 1, 1, 1, 1, 1, 1, 0,     // 0x20 - 0x27
    0, 0, 1, 3, 0, 0, 0, 3,     // 0x28 - 0x2f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x30 - 0x37
    0, 0, 0, 1, 1, 1, 1, 0,     // 0x38 - 0x3f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x40 - 0x47
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x48 - 0x4f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x50 - 0x57
    0, 0, 0, 1, 3, 1, 1, 1,     // 0x58 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x60 - 0x67
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x68 - 0x6f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x70 - 0x77
    0, 0, 0, 1, 1, 1, 3, 3      // 0x78 - 0x7f
};
639
640 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
641 {
642 size_t len = 0;
643
644 while (*psz && ((!buf) || (len < n)))
645 {
646 wchar_t cc = *psz++;
647 if (cc < 0x80 && utf7encode[cc] < 1)
648 {
649 // plain ASCII char
650 if (buf)
651 *buf++ = (char)cc;
652
653 len++;
654 }
655 #ifndef WC_UTF16
656 else if (((wxUint32)cc) > 0xffff)
657 {
658 // no surrogate pair generation (yet?)
659 return wxCONV_FAILED;
660 }
661 #endif
662 else
663 {
664 if (buf)
665 *buf++ = '+';
666
667 len++;
668 if (cc != '+')
669 {
670 // BASE64 encode string
671 unsigned int lsb, d, l;
672 for (d = 0, l = 0; /*nothing*/; psz++)
673 {
674 for (lsb = 0; lsb < 2; lsb ++)
675 {
676 d <<= 8;
677 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
678
679 for (l += 8; l >= 6; )
680 {
681 l -= 6;
682 if (buf)
683 *buf++ = utf7enb64[(d >> l) % 64];
684 len++;
685 }
686 }
687
688 cc = *psz;
689 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
690 break;
691 }
692
693 if (l != 0)
694 {
695 if (buf)
696 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
697
698 len++;
699 }
700 }
701
702 if (buf)
703 *buf++ = '-';
704 len++;
705 }
706 }
707
708 if (buf && (len < n))
709 *buf = 0;
710
711 return len;
712 }
713
714 // ----------------------------------------------------------------------------
715 // UTF-8
716 // ----------------------------------------------------------------------------
717
718 static wxUint32 utf8_max[]=
719 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
720
721 // boundaries of the private use area we use to (temporarily) remap invalid
722 // characters invalid in a UTF-8 encoded string
723 const wxUint32 wxUnicodePUA = 0x100000;
724 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
725
726 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
727 {
728 size_t len = 0;
729
730 while (*psz && ((!buf) || (len < n)))
731 {
732 const char *opsz = psz;
733 bool invalid = false;
734 unsigned char cc = *psz++, fc = cc;
735 unsigned cnt;
736 for (cnt = 0; fc & 0x80; cnt++)
737 fc <<= 1;
738
739 if (!cnt)
740 {
741 // plain ASCII char
742 if (buf)
743 *buf++ = cc;
744 len++;
745
746 // escape the escape character for octal escapes
747 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
748 && cc == '\\' && (!buf || len < n))
749 {
750 if (buf)
751 *buf++ = cc;
752 len++;
753 }
754 }
755 else
756 {
757 cnt--;
758 if (!cnt)
759 {
760 // invalid UTF-8 sequence
761 invalid = true;
762 }
763 else
764 {
765 unsigned ocnt = cnt - 1;
766 wxUint32 res = cc & (0x3f >> cnt);
767 while (cnt--)
768 {
769 cc = *psz;
770 if ((cc & 0xC0) != 0x80)
771 {
772 // invalid UTF-8 sequence
773 invalid = true;
774 break;
775 }
776
777 psz++;
778 res = (res << 6) | (cc & 0x3f);
779 }
780
781 if (invalid || res <= utf8_max[ocnt])
782 {
783 // illegal UTF-8 encoding
784 invalid = true;
785 }
786 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
787 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
788 {
789 // if one of our PUA characters turns up externally
790 // it must also be treated as an illegal sequence
791 // (a bit like you have to escape an escape character)
792 invalid = true;
793 }
794 else
795 {
796 #ifdef WC_UTF16
797 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
798 size_t pa = encode_utf16(res, (wxUint16 *)buf);
799 if (pa == wxCONV_FAILED)
800 {
801 invalid = true;
802 }
803 else
804 {
805 if (buf)
806 buf += pa;
807 len += pa;
808 }
809 #else // !WC_UTF16
810 if (buf)
811 *buf++ = (wchar_t)res;
812 len++;
813 #endif // WC_UTF16/!WC_UTF16
814 }
815 }
816
817 if (invalid)
818 {
819 if (m_options & MAP_INVALID_UTF8_TO_PUA)
820 {
821 while (opsz < psz && (!buf || len < n))
822 {
823 #ifdef WC_UTF16
824 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
825 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
826 wxASSERT(pa != wxCONV_FAILED);
827 if (buf)
828 buf += pa;
829 opsz++;
830 len += pa;
831 #else
832 if (buf)
833 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
834 opsz++;
835 len++;
836 #endif
837 }
838 }
839 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
840 {
841 while (opsz < psz && (!buf || len < n))
842 {
843 if ( buf && len + 3 < n )
844 {
845 unsigned char on = *opsz;
846 *buf++ = L'\\';
847 *buf++ = (wchar_t)( L'0' + on / 0100 );
848 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
849 *buf++ = (wchar_t)( L'0' + on % 010 );
850 }
851
852 opsz++;
853 len += 4;
854 }
855 }
856 else // MAP_INVALID_UTF8_NOT
857 {
858 return wxCONV_FAILED;
859 }
860 }
861 }
862 }
863
864 if (buf && (len < n))
865 *buf = 0;
866
867 return len;
868 }
869
// Returns true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
874
875 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
876 {
877 size_t len = 0;
878
879 while (*psz && ((!buf) || (len < n)))
880 {
881 wxUint32 cc;
882
883 #ifdef WC_UTF16
884 // cast is ok for WC_UTF16
885 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
886 psz += (pa == wxCONV_FAILED) ? 1 : pa;
887 #else
888 cc = (*psz++) & 0x7fffffff;
889 #endif
890
891 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
892 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
893 {
894 if (buf)
895 *buf++ = (char)(cc - wxUnicodePUA);
896 len++;
897 }
898 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
899 && cc == L'\\' && psz[0] == L'\\' )
900 {
901 if (buf)
902 *buf++ = (char)cc;
903 psz++;
904 len++;
905 }
906 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
907 cc == L'\\' &&
908 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
909 {
910 if (buf)
911 {
912 *buf++ = (char) ((psz[0] - L'0') * 0100 +
913 (psz[1] - L'0') * 010 +
914 (psz[2] - L'0'));
915 }
916
917 psz += 3;
918 len++;
919 }
920 else
921 {
922 unsigned cnt;
923 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
924 {
925 }
926
927 if (!cnt)
928 {
929 // plain ASCII char
930 if (buf)
931 *buf++ = (char) cc;
932 len++;
933 }
934 else
935 {
936 len += cnt + 1;
937 if (buf)
938 {
939 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
940 while (cnt--)
941 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
942 }
943 }
944 }
945 }
946
947 if (buf && (len < n))
948 *buf = 0;
949
950 return len;
951 }
952
953 // ============================================================================
954 // UTF-16
955 // ============================================================================
956
957 #ifdef WORDS_BIGENDIAN
958 #define wxMBConvUTF16straight wxMBConvUTF16BE
959 #define wxMBConvUTF16swap wxMBConvUTF16LE
960 #else
961 #define wxMBConvUTF16swap wxMBConvUTF16BE
962 #define wxMBConvUTF16straight wxMBConvUTF16LE
963 #endif
964
965 /* static */
966 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
967 {
968 if ( srcLen == wxNO_LEN )
969 {
970 // count the number of bytes in input, including the trailing NULs
971 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
972 for ( srcLen = 1; *inBuff++; srcLen++ )
973 ;
974
975 srcLen *= BYTES_PER_CHAR;
976 }
977 else // we already have the length
978 {
979 // we can only convert an entire number of UTF-16 characters
980 if ( srcLen % BYTES_PER_CHAR )
981 return wxCONV_FAILED;
982 }
983
984 return srcLen;
985 }
986
987 // case when in-memory representation is UTF-16 too
988 #ifdef WC_UTF16
989
990 // ----------------------------------------------------------------------------
991 // conversions without endianness change
992 // ----------------------------------------------------------------------------
993
994 size_t
995 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
996 const char *src, size_t srcLen) const
997 {
998 // set up the scene for using memcpy() (which is presumably more efficient
999 // than copying the bytes one by one)
1000 srcLen = GetLength(src, srcLen);
1001 if ( srcLen == wxNO_LEN )
1002 return wxCONV_FAILED;
1003
1004 const size_t inLen = srcLen / BYTES_PER_CHAR;
1005 if ( dst )
1006 {
1007 if ( dstLen < inLen )
1008 return wxCONV_FAILED;
1009
1010 memcpy(dst, src, srcLen);
1011 }
1012
1013 return inLen;
1014 }
1015
1016 size_t
1017 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1018 const wchar_t *src, size_t srcLen) const
1019 {
1020 if ( srcLen == wxNO_LEN )
1021 srcLen = wxWcslen(src) + 1;
1022
1023 srcLen *= BYTES_PER_CHAR;
1024
1025 if ( dst )
1026 {
1027 if ( dstLen < srcLen )
1028 return wxCONV_FAILED;
1029
1030 memcpy(dst, src, srcLen);
1031 }
1032
1033 return srcLen;
1034 }
1035
1036 // ----------------------------------------------------------------------------
1037 // endian-reversing conversions
1038 // ----------------------------------------------------------------------------
1039
1040 size_t
1041 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1042 const char *src, size_t srcLen) const
1043 {
1044 srcLen = GetLength(src, srcLen);
1045 if ( srcLen == wxNO_LEN )
1046 return wxCONV_FAILED;
1047
1048 srcLen /= BYTES_PER_CHAR;
1049
1050 if ( dst )
1051 {
1052 if ( dstLen < srcLen )
1053 return wxCONV_FAILED;
1054
1055 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1056 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1057 {
1058 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1059 }
1060 }
1061
1062 return srcLen;
1063 }
1064
1065 size_t
1066 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1067 const wchar_t *src, size_t srcLen) const
1068 {
1069 if ( srcLen == wxNO_LEN )
1070 srcLen = wxWcslen(src) + 1;
1071
1072 srcLen *= BYTES_PER_CHAR;
1073
1074 if ( dst )
1075 {
1076 if ( dstLen < srcLen )
1077 return wxCONV_FAILED;
1078
1079 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1080 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1081 {
1082 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1083 }
1084 }
1085
1086 return srcLen;
1087 }
1088
1089 #else // !WC_UTF16: wchar_t is UTF-32
1090
1091 // ----------------------------------------------------------------------------
1092 // conversions without endianness change
1093 // ----------------------------------------------------------------------------
1094
1095 size_t
1096 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1097 const char *src, size_t srcLen) const
1098 {
1099 srcLen = GetLength(src, srcLen);
1100 if ( srcLen == wxNO_LEN )
1101 return wxCONV_FAILED;
1102
1103 const size_t inLen = srcLen / BYTES_PER_CHAR;
1104 if ( !dst )
1105 {
1106 // optimization: return maximal space which could be needed for this
1107 // string even if the real size could be smaller if the buffer contains
1108 // any surrogates
1109 return inLen;
1110 }
1111
1112 size_t outLen = 0;
1113 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1114 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1115 {
1116 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1117 if ( !inBuff )
1118 return wxCONV_FAILED;
1119
1120 if ( ++outLen > dstLen )
1121 return wxCONV_FAILED;
1122
1123 *dst++ = ch;
1124 }
1125
1126
1127 return outLen;
1128 }
1129
1130 size_t
1131 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1132 const wchar_t *src, size_t srcLen) const
1133 {
1134 if ( srcLen == wxNO_LEN )
1135 srcLen = wxWcslen(src) + 1;
1136
1137 size_t outLen = 0;
1138 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1139 for ( size_t n = 0; n < srcLen; n++ )
1140 {
1141 wxUint16 cc[2];
1142 const size_t numChars = encode_utf16(*src++, cc);
1143 if ( numChars == wxCONV_FAILED )
1144 return wxCONV_FAILED;
1145
1146 outLen += numChars * BYTES_PER_CHAR;
1147 if ( outBuff )
1148 {
1149 if ( outLen > dstLen )
1150 return wxCONV_FAILED;
1151
1152 *outBuff++ = cc[0];
1153 if ( numChars == 2 )
1154 {
1155 // second character of a surrogate
1156 *outBuff++ = cc[1];
1157 }
1158 }
1159 }
1160
1161 return outLen;
1162 }
1163
1164 // ----------------------------------------------------------------------------
1165 // endian-reversing conversions
1166 // ----------------------------------------------------------------------------
1167
1168 size_t
1169 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1170 const char *src, size_t srcLen) const
1171 {
1172 srcLen = GetLength(src, srcLen);
1173 if ( srcLen == wxNO_LEN )
1174 return wxCONV_FAILED;
1175
1176 const size_t inLen = srcLen / BYTES_PER_CHAR;
1177 if ( !dst )
1178 {
1179 // optimization: return maximal space which could be needed for this
1180 // string even if the real size could be smaller if the buffer contains
1181 // any surrogates
1182 return inLen;
1183 }
1184
1185 size_t outLen = 0;
1186 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1187 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1188 {
1189 wxUint32 ch;
1190 wxUint16 tmp[2];
1191
1192 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1193 inBuff++;
1194 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1195
1196 const size_t numChars = decode_utf16(tmp, ch);
1197 if ( numChars == wxCONV_FAILED )
1198 return wxCONV_FAILED;
1199
1200 if ( numChars == 2 )
1201 inBuff++;
1202
1203 if ( ++outLen > dstLen )
1204 return wxCONV_FAILED;
1205
1206 *dst++ = ch;
1207 }
1208
1209
1210 return outLen;
1211 }
1212
1213 size_t
1214 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1215 const wchar_t *src, size_t srcLen) const
1216 {
1217 if ( srcLen == wxNO_LEN )
1218 srcLen = wxWcslen(src) + 1;
1219
1220 size_t outLen = 0;
1221 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1222 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1223 {
1224 wxUint16 cc[2];
1225 const size_t numChars = encode_utf16(*src, cc);
1226 if ( numChars == wxCONV_FAILED )
1227 return wxCONV_FAILED;
1228
1229 outLen += numChars * BYTES_PER_CHAR;
1230 if ( outBuff )
1231 {
1232 if ( outLen > dstLen )
1233 return wxCONV_FAILED;
1234
1235 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1236 if ( numChars == 2 )
1237 {
1238 // second character of a surrogate
1239 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1240 }
1241 }
1242 }
1243
1244 return outLen;
1245 }
1246
1247 #endif // WC_UTF16/!WC_UTF16
1248
1249
1250 // ============================================================================
1251 // UTF-32
1252 // ============================================================================
1253
1254 #ifdef WORDS_BIGENDIAN
1255 #define wxMBConvUTF32straight wxMBConvUTF32BE
1256 #define wxMBConvUTF32swap wxMBConvUTF32LE
1257 #else
1258 #define wxMBConvUTF32swap wxMBConvUTF32BE
1259 #define wxMBConvUTF32straight wxMBConvUTF32LE
1260 #endif
1261
1262
1263 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1264 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1265
1266 /* static */
1267 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1268 {
1269 if ( srcLen == wxNO_LEN )
1270 {
1271 // count the number of bytes in input, including the trailing NULs
1272 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1273 for ( srcLen = 1; *inBuff++; srcLen++ )
1274 ;
1275
1276 srcLen *= BYTES_PER_CHAR;
1277 }
1278 else // we already have the length
1279 {
1280 // we can only convert an entire number of UTF-32 characters
1281 if ( srcLen % BYTES_PER_CHAR )
1282 return wxCONV_FAILED;
1283 }
1284
1285 return srcLen;
1286 }
1287
1288 // case when in-memory representation is UTF-16
1289 #ifdef WC_UTF16
1290
1291 // ----------------------------------------------------------------------------
1292 // conversions without endianness change
1293 // ----------------------------------------------------------------------------
1294
1295 size_t
1296 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1297 const char *src, size_t srcLen) const
1298 {
1299 srcLen = GetLength(src, srcLen);
1300 if ( srcLen == wxNO_LEN )
1301 return wxCONV_FAILED;
1302
1303 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1304 const size_t inLen = srcLen / BYTES_PER_CHAR;
1305 size_t outLen = 0;
1306 for ( size_t n = 0; n < inLen; n++ )
1307 {
1308 wxUint16 cc[2];
1309 const size_t numChars = encode_utf16(*inBuff++, cc);
1310 if ( numChars == wxCONV_FAILED )
1311 return wxCONV_FAILED;
1312
1313 outLen += numChars;
1314 if ( dst )
1315 {
1316 if ( outLen > dstLen )
1317 return wxCONV_FAILED;
1318
1319 *dst++ = cc[0];
1320 if ( numChars == 2 )
1321 {
1322 // second character of a surrogate
1323 *dst++ = cc[1];
1324 }
1325 }
1326 }
1327
1328 return outLen;
1329 }
1330
1331 size_t
1332 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1333 const wchar_t *src, size_t srcLen) const
1334 {
1335 if ( srcLen == wxNO_LEN )
1336 srcLen = wxWcslen(src) + 1;
1337
1338 if ( !dst )
1339 {
1340 // optimization: return maximal space which could be needed for this
1341 // string instead of the exact amount which could be less if there are
1342 // any surrogates in the input
1343 //
1344 // we consider that surrogates are rare enough to make it worthwhile to
1345 // avoid running the loop below at the cost of slightly extra memory
1346 // consumption
1347 return srcLen * BYTES_PER_CHAR;
1348 }
1349
1350 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1351 size_t outLen = 0;
1352 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1353 {
1354 const wxUint32 ch = wxDecodeSurrogate(&src);
1355 if ( !src )
1356 return wxCONV_FAILED;
1357
1358 outLen += BYTES_PER_CHAR;
1359
1360 if ( outLen > dstLen )
1361 return wxCONV_FAILED;
1362
1363 *outBuff++ = ch;
1364 }
1365
1366 return outLen;
1367 }
1368
1369 // ----------------------------------------------------------------------------
1370 // endian-reversing conversions
1371 // ----------------------------------------------------------------------------
1372
1373 size_t
1374 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1375 const char *src, size_t srcLen) const
1376 {
1377 srcLen = GetLength(src, srcLen);
1378 if ( srcLen == wxNO_LEN )
1379 return wxCONV_FAILED;
1380
1381 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1382 const size_t inLen = srcLen / BYTES_PER_CHAR;
1383 size_t outLen = 0;
1384 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1385 {
1386 wxUint16 cc[2];
1387 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1388 if ( numChars == wxCONV_FAILED )
1389 return wxCONV_FAILED;
1390
1391 outLen += numChars;
1392 if ( dst )
1393 {
1394 if ( outLen > dstLen )
1395 return wxCONV_FAILED;
1396
1397 *dst++ = cc[0];
1398 if ( numChars == 2 )
1399 {
1400 // second character of a surrogate
1401 *dst++ = cc[1];
1402 }
1403 }
1404 }
1405
1406 return outLen;
1407 }
1408
1409 size_t
1410 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1411 const wchar_t *src, size_t srcLen) const
1412 {
1413 if ( srcLen == wxNO_LEN )
1414 srcLen = wxWcslen(src) + 1;
1415
1416 if ( !dst )
1417 {
1418 // optimization: return maximal space which could be needed for this
1419 // string instead of the exact amount which could be less if there are
1420 // any surrogates in the input
1421 //
1422 // we consider that surrogates are rare enough to make it worthwhile to
1423 // avoid running the loop below at the cost of slightly extra memory
1424 // consumption
1425 return srcLen*BYTES_PER_CHAR;
1426 }
1427
1428 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1429 size_t outLen = 0;
1430 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1431 {
1432 const wxUint32 ch = wxDecodeSurrogate(&src);
1433 if ( !src )
1434 return wxCONV_FAILED;
1435
1436 outLen += BYTES_PER_CHAR;
1437
1438 if ( outLen > dstLen )
1439 return wxCONV_FAILED;
1440
1441 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1442 }
1443
1444 return outLen;
1445 }
1446
1447 #else // !WC_UTF16: wchar_t is UTF-32
1448
1449 // ----------------------------------------------------------------------------
1450 // conversions without endianness change
1451 // ----------------------------------------------------------------------------
1452
1453 size_t
1454 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1455 const char *src, size_t srcLen) const
1456 {
1457 // use memcpy() as it should be much faster than hand-written loop
1458 srcLen = GetLength(src, srcLen);
1459 if ( srcLen == wxNO_LEN )
1460 return wxCONV_FAILED;
1461
1462 const size_t inLen = srcLen/BYTES_PER_CHAR;
1463 if ( dst )
1464 {
1465 if ( dstLen < inLen )
1466 return wxCONV_FAILED;
1467
1468 memcpy(dst, src, srcLen);
1469 }
1470
1471 return inLen;
1472 }
1473
1474 size_t
1475 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1476 const wchar_t *src, size_t srcLen) const
1477 {
1478 if ( srcLen == wxNO_LEN )
1479 srcLen = wxWcslen(src) + 1;
1480
1481 srcLen *= BYTES_PER_CHAR;
1482
1483 if ( dst )
1484 {
1485 if ( dstLen < srcLen )
1486 return wxCONV_FAILED;
1487
1488 memcpy(dst, src, srcLen);
1489 }
1490
1491 return srcLen;
1492 }
1493
1494 // ----------------------------------------------------------------------------
1495 // endian-reversing conversions
1496 // ----------------------------------------------------------------------------
1497
1498 size_t
1499 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1500 const char *src, size_t srcLen) const
1501 {
1502 srcLen = GetLength(src, srcLen);
1503 if ( srcLen == wxNO_LEN )
1504 return wxCONV_FAILED;
1505
1506 srcLen /= BYTES_PER_CHAR;
1507
1508 if ( dst )
1509 {
1510 if ( dstLen < srcLen )
1511 return wxCONV_FAILED;
1512
1513 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1514 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1515 {
1516 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1517 }
1518 }
1519
1520 return srcLen;
1521 }
1522
1523 size_t
1524 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1525 const wchar_t *src, size_t srcLen) const
1526 {
1527 if ( srcLen == wxNO_LEN )
1528 srcLen = wxWcslen(src) + 1;
1529
1530 srcLen *= BYTES_PER_CHAR;
1531
1532 if ( dst )
1533 {
1534 if ( dstLen < srcLen )
1535 return wxCONV_FAILED;
1536
1537 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1538 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1539 {
1540 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1541 }
1542 }
1543
1544 return srcLen;
1545 }
1546
1547 #endif // WC_UTF16/!WC_UTF16
1548
1549
1550 // ============================================================================
1551 // The classes doing conversion using the iconv_xxx() functions
1552 // ============================================================================
1553
1554 #ifdef HAVE_ICONV
1555
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) takes the return value of iconv() and the number
// of input bytes left after the call and evaluates to true if the call really
// failed. The macro arguments are parenthesized so that arbitrary expressions
// (e.g. a conditional one) can be safely passed to it.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast a pointer to the input buffer pointer to the type iconv() expects
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// the value used for the conversion descriptors which couldn't be opened
#define ICONV_T_INVALID ((iconv_t)-1)
1574
1575 #if SIZEOF_WCHAR_T == 4
1576 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1577 #define WC_ENC wxFONTENCODING_UTF32
1578 #elif SIZEOF_WCHAR_T == 2
1579 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1580 #define WC_ENC wxFONTENCODING_UTF16
1581 #else // sizeof(wchar_t) != 2 nor 4
1582 // does this ever happen?
1583 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1584 #endif
1585
1586 // ----------------------------------------------------------------------------
1587 // wxMBConv_iconv: encapsulates an iconv character set
1588 // ----------------------------------------------------------------------------
1589
1590 class wxMBConv_iconv : public wxMBConv
1591 {
1592 public:
1593 wxMBConv_iconv(const wxChar *name);
1594 virtual ~wxMBConv_iconv();
1595
1596 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1597 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1598
1599 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1600 virtual size_t GetMBNulLen() const;
1601
1602 virtual wxMBConv *Clone() const
1603 {
1604 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1605 p->m_minMBCharWidth = m_minMBCharWidth;
1606 return p;
1607 }
1608
1609 bool IsOk() const
1610 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1611
1612 protected:
1613 // the iconv handlers used to translate from multibyte
1614 // to wide char and in the other direction
1615 iconv_t m2w,
1616 w2m;
1617
1618 #if wxUSE_THREADS
1619 // guards access to m2w and w2m objects
1620 wxMutex m_iconvMutex;
1621 #endif
1622
1623 private:
1624 // the name (for iconv_open()) of a wide char charset -- if none is
1625 // available on this machine, it will remain NULL
1626 static wxString ms_wcCharsetName;
1627
1628 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1629 // different endian-ness than the native one
1630 static bool ms_wcNeedsSwap;
1631
1632
1633 // name of the encoding handled by this conversion
1634 wxString m_name;
1635
1636 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1637 // initially
1638 size_t m_minMBCharWidth;
1639 };
1640
1641 // make the constructor available for unit testing
1642 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1643 {
1644 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1645 if ( !result->IsOk() )
1646 {
1647 delete result;
1648 return 0;
1649 }
1650
1651 return result;
1652 }
1653
1654 wxString wxMBConv_iconv::ms_wcCharsetName;
1655 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1656
1657 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1658 : m_name(name)
1659 {
1660 m_minMBCharWidth = 0;
1661
1662 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1663 // names for the charsets
1664 const wxCharBuffer cname(wxString(name).ToAscii());
1665
1666 // check for charset that represents wchar_t:
1667 if ( ms_wcCharsetName.empty() )
1668 {
1669 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1670
1671 #if wxUSE_FONTMAP
1672 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1673 #else // !wxUSE_FONTMAP
1674 static const wxChar *names[] =
1675 {
1676 #if SIZEOF_WCHAR_T == 4
1677 _T("UCS-4"),
1678 #elif SIZEOF_WCHAR_T = 2
1679 _T("UCS-2"),
1680 #endif
1681 NULL
1682 };
1683 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1684
1685 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1686 {
1687 const wxString nameCS(*names);
1688
1689 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1690 wxString nameXE(nameCS);
1691
1692 #ifdef WORDS_BIGENDIAN
1693 nameXE += _T("BE");
1694 #else // little endian
1695 nameXE += _T("LE");
1696 #endif
1697
1698 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1699 nameXE.c_str());
1700
1701 m2w = iconv_open(nameXE.ToAscii(), cname);
1702 if ( m2w == ICONV_T_INVALID )
1703 {
1704 // try charset w/o bytesex info (e.g. "UCS4")
1705 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1706 nameCS.c_str());
1707 m2w = iconv_open(nameCS.ToAscii(), cname);
1708
1709 // and check for bytesex ourselves:
1710 if ( m2w != ICONV_T_INVALID )
1711 {
1712 char buf[2], *bufPtr;
1713 wchar_t wbuf[2], *wbufPtr;
1714 size_t insz, outsz;
1715 size_t res;
1716
1717 buf[0] = 'A';
1718 buf[1] = 0;
1719 wbuf[0] = 0;
1720 insz = 2;
1721 outsz = SIZEOF_WCHAR_T * 2;
1722 wbufPtr = wbuf;
1723 bufPtr = buf;
1724
1725 res = iconv(
1726 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1727 (char**)&wbufPtr, &outsz);
1728
1729 if (ICONV_FAILED(res, insz))
1730 {
1731 wxLogLastError(wxT("iconv"));
1732 wxLogError(_("Conversion to charset '%s' doesn't work."),
1733 nameCS.c_str());
1734 }
1735 else // ok, can convert to this encoding, remember it
1736 {
1737 ms_wcCharsetName = nameCS;
1738 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1739 }
1740 }
1741 }
1742 else // use charset not requiring byte swapping
1743 {
1744 ms_wcCharsetName = nameXE;
1745 }
1746 }
1747
1748 wxLogTrace(TRACE_STRCONV,
1749 wxT("iconv wchar_t charset is \"%s\"%s"),
1750 ms_wcCharsetName.empty() ? _T("<none>")
1751 : ms_wcCharsetName.c_str(),
1752 ms_wcNeedsSwap ? _T(" (needs swap)")
1753 : _T(""));
1754 }
1755 else // we already have ms_wcCharsetName
1756 {
1757 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1758 }
1759
1760 if ( ms_wcCharsetName.empty() )
1761 {
1762 w2m = ICONV_T_INVALID;
1763 }
1764 else
1765 {
1766 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1767 if ( w2m == ICONV_T_INVALID )
1768 {
1769 wxLogTrace(TRACE_STRCONV,
1770 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1771 ms_wcCharsetName.c_str(), cname.data());
1772 }
1773 }
1774 }
1775
1776 wxMBConv_iconv::~wxMBConv_iconv()
1777 {
1778 if ( m2w != ICONV_T_INVALID )
1779 iconv_close(m2w);
1780 if ( w2m != ICONV_T_INVALID )
1781 iconv_close(w2m);
1782 }
1783
1784 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1785 {
1786 // find the string length: notice that must be done differently for
1787 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1788 size_t inbuf;
1789 const size_t nulLen = GetMBNulLen();
1790 switch ( nulLen )
1791 {
1792 default:
1793 return wxCONV_FAILED;
1794
1795 case 1:
1796 inbuf = strlen(psz); // arguably more optimized than our version
1797 break;
1798
1799 case 2:
1800 case 4:
1801 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1802 // they also have to start at character boundary and not span two
1803 // adjacent characters
1804 const char *p;
1805 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1806 ;
1807 inbuf = p - psz;
1808 break;
1809 }
1810
1811 #if wxUSE_THREADS
1812 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1813 // Unfortunately there is a couple of global wxCSConv objects such as
1814 // wxConvLocal that are used all over wx code, so we have to make sure
1815 // the handle is used by at most one thread at the time. Otherwise
1816 // only a few wx classes would be safe to use from non-main threads
1817 // as MB<->WC conversion would fail "randomly".
1818 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1819 #endif // wxUSE_THREADS
1820
1821 size_t outbuf = n * SIZEOF_WCHAR_T;
1822 size_t res, cres;
1823 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1824 wchar_t *bufPtr = buf;
1825 const char *pszPtr = psz;
1826
1827 if (buf)
1828 {
1829 // have destination buffer, convert there
1830 cres = iconv(m2w,
1831 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1832 (char**)&bufPtr, &outbuf);
1833 res = n - (outbuf / SIZEOF_WCHAR_T);
1834
1835 if (ms_wcNeedsSwap)
1836 {
1837 // convert to native endianness
1838 for ( unsigned i = 0; i < res; i++ )
1839 buf[n] = WC_BSWAP(buf[i]);
1840 }
1841
1842 // NUL-terminate the string if there is any space left
1843 if (res < n)
1844 buf[res] = 0;
1845 }
1846 else
1847 {
1848 // no destination buffer... convert using temp buffer
1849 // to calculate destination buffer requirement
1850 wchar_t tbuf[8];
1851 res = 0;
1852
1853 do
1854 {
1855 bufPtr = tbuf;
1856 outbuf = 8 * SIZEOF_WCHAR_T;
1857
1858 cres = iconv(m2w,
1859 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1860 (char**)&bufPtr, &outbuf );
1861
1862 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1863 }
1864 while ((cres == (size_t)-1) && (errno == E2BIG));
1865 }
1866
1867 if (ICONV_FAILED(cres, inbuf))
1868 {
1869 //VS: it is ok if iconv fails, hence trace only
1870 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1871 return wxCONV_FAILED;
1872 }
1873
1874 return res;
1875 }
1876
1877 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1878 {
1879 #if wxUSE_THREADS
1880 // NB: explained in MB2WC
1881 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1882 #endif
1883
1884 size_t inlen = wxWcslen(psz);
1885 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1886 size_t outbuf = n;
1887 size_t res, cres;
1888
1889 wchar_t *tmpbuf = 0;
1890
1891 if (ms_wcNeedsSwap)
1892 {
1893 // need to copy to temp buffer to switch endianness
1894 // (doing WC_BSWAP twice on the original buffer won't help, as it
1895 // could be in read-only memory, or be accessed in some other thread)
1896 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1897 for ( size_t i = 0; i < inlen; i++ )
1898 tmpbuf[n] = WC_BSWAP(psz[i]);
1899
1900 tmpbuf[inlen] = L'\0';
1901 psz = tmpbuf;
1902 }
1903
1904 if (buf)
1905 {
1906 // have destination buffer, convert there
1907 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1908
1909 res = n - outbuf;
1910
1911 // NB: iconv was given only wcslen(psz) characters on input, and so
1912 // it couldn't convert the trailing zero. Let's do it ourselves
1913 // if there's some room left for it in the output buffer.
1914 if (res < n)
1915 buf[0] = 0;
1916 }
1917 else
1918 {
1919 // no destination buffer: convert using temp buffer
1920 // to calculate destination buffer requirement
1921 char tbuf[16];
1922 res = 0;
1923 do
1924 {
1925 buf = tbuf;
1926 outbuf = 16;
1927
1928 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1929
1930 res += 16 - outbuf;
1931 }
1932 while ((cres == (size_t)-1) && (errno == E2BIG));
1933 }
1934
1935 if (ms_wcNeedsSwap)
1936 {
1937 free(tmpbuf);
1938 }
1939
1940 if (ICONV_FAILED(cres, inbuf))
1941 {
1942 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1943 return wxCONV_FAILED;
1944 }
1945
1946 return res;
1947 }
1948
1949 size_t wxMBConv_iconv::GetMBNulLen() const
1950 {
1951 if ( m_minMBCharWidth == 0 )
1952 {
1953 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1954
1955 #if wxUSE_THREADS
1956 // NB: explained in MB2WC
1957 wxMutexLocker lock(self->m_iconvMutex);
1958 #endif
1959
1960 wchar_t *wnul = L"";
1961 char buf[8]; // should be enough for NUL in any encoding
1962 size_t inLen = sizeof(wchar_t),
1963 outLen = WXSIZEOF(buf);
1964 char *inBuff = (char *)wnul;
1965 char *outBuff = buf;
1966 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1967 {
1968 self->m_minMBCharWidth = (size_t)-1;
1969 }
1970 else // ok
1971 {
1972 self->m_minMBCharWidth = outBuff - buf;
1973 }
1974 }
1975
1976 return m_minMBCharWidth;
1977 }
1978
1979 #endif // HAVE_ICONV
1980
1981
1982 // ============================================================================
1983 // Win32 conversion classes
1984 // ============================================================================
1985
1986 #ifdef wxHAVE_WIN32_MB2WC
1987
1988 // from utils.cpp
1989 #if wxUSE_FONTMAP
1990 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1991 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1992 #endif
1993
1994 class wxMBConv_win32 : public wxMBConv
1995 {
1996 public:
1997 wxMBConv_win32()
1998 {
1999 m_CodePage = CP_ACP;
2000 m_minMBCharWidth = 0;
2001 }
2002
2003 wxMBConv_win32(const wxMBConv_win32& conv)
2004 {
2005 m_CodePage = conv.m_CodePage;
2006 m_minMBCharWidth = conv.m_minMBCharWidth;
2007 }
2008
2009 #if wxUSE_FONTMAP
2010 wxMBConv_win32(const wxChar* name)
2011 {
2012 m_CodePage = wxCharsetToCodepage(name);
2013 m_minMBCharWidth = 0;
2014 }
2015
2016 wxMBConv_win32(wxFontEncoding encoding)
2017 {
2018 m_CodePage = wxEncodingToCodepage(encoding);
2019 m_minMBCharWidth = 0;
2020 }
2021 #endif // wxUSE_FONTMAP
2022
2023 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2024 {
2025 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2026 // the behaviour is not compatible with the Unix version (using iconv)
2027 // and break the library itself, e.g. wxTextInputStream::NextChar()
2028 // wouldn't work if reading an incomplete MB char didn't result in an
2029 // error
2030 //
2031 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2032 // Win XP or newer and it is not supported for UTF-[78] so we always
2033 // use our own conversions in this case. See
2034 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2035 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2036 if ( m_CodePage == CP_UTF8 )
2037 {
2038 return wxConvUTF8.MB2WC(buf, psz, n);
2039 }
2040
2041 if ( m_CodePage == CP_UTF7 )
2042 {
2043 return wxConvUTF7.MB2WC(buf, psz, n);
2044 }
2045
2046 int flags = 0;
2047 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2048 IsAtLeastWin2kSP4() )
2049 {
2050 flags = MB_ERR_INVALID_CHARS;
2051 }
2052
2053 const size_t len = ::MultiByteToWideChar
2054 (
2055 m_CodePage, // code page
2056 flags, // flags: fall on error
2057 psz, // input string
2058 -1, // its length (NUL-terminated)
2059 buf, // output string
2060 buf ? n : 0 // size of output buffer
2061 );
2062 if ( !len )
2063 {
2064 // function totally failed
2065 return wxCONV_FAILED;
2066 }
2067
2068 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2069 // check if we succeeded, by doing a double trip:
2070 if ( !flags && buf )
2071 {
2072 const size_t mbLen = strlen(psz);
2073 wxCharBuffer mbBuf(mbLen);
2074 if ( ::WideCharToMultiByte
2075 (
2076 m_CodePage,
2077 0,
2078 buf,
2079 -1,
2080 mbBuf.data(),
2081 mbLen + 1, // size in bytes, not length
2082 NULL,
2083 NULL
2084 ) == 0 ||
2085 strcmp(mbBuf, psz) != 0 )
2086 {
2087 // we didn't obtain the same thing we started from, hence
2088 // the conversion was lossy and we consider that it failed
2089 return wxCONV_FAILED;
2090 }
2091 }
2092
2093 // note that it returns count of written chars for buf != NULL and size
2094 // of the needed buffer for buf == NULL so in either case the length of
2095 // the string (which never includes the terminating NUL) is one less
2096 return len - 1;
2097 }
2098
2099 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2100 {
2101 /*
2102 we have a problem here: by default, WideCharToMultiByte() may
2103 replace characters unrepresentable in the target code page with bad
2104 quality approximations such as turning "1/2" symbol (U+00BD) into
2105 "1" for the code pages which don't have it and we, obviously, want
2106 to avoid this at any price
2107
2108 the trouble is that this function does it _silently_, i.e. it won't
2109 even tell us whether it did or not... Win98/2000 and higher provide
2110 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2111 we have to resort to a round trip, i.e. check that converting back
2112 results in the same string -- this is, of course, expensive but
2113 otherwise we simply can't be sure to not garble the data.
2114 */
2115
2116 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2117 // it doesn't work with CJK encodings (which we test for rather roughly
2118 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2119 // supporting it
2120 BOOL usedDef wxDUMMY_INITIALIZE(false);
2121 BOOL *pUsedDef;
2122 int flags;
2123 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2124 {
2125 // it's our lucky day
2126 flags = WC_NO_BEST_FIT_CHARS;
2127 pUsedDef = &usedDef;
2128 }
2129 else // old system or unsupported encoding
2130 {
2131 flags = 0;
2132 pUsedDef = NULL;
2133 }
2134
2135 const size_t len = ::WideCharToMultiByte
2136 (
2137 m_CodePage, // code page
2138 flags, // either none or no best fit
2139 pwz, // input string
2140 -1, // it is (wide) NUL-terminated
2141 buf, // output buffer
2142 buf ? n : 0, // and its size
2143 NULL, // default "replacement" char
2144 pUsedDef // [out] was it used?
2145 );
2146
2147 if ( !len )
2148 {
2149 // function totally failed
2150 return wxCONV_FAILED;
2151 }
2152
2153 // if we were really converting, check if we succeeded
2154 if ( buf )
2155 {
2156 if ( flags )
2157 {
2158 // check if the conversion failed, i.e. if any replacements
2159 // were done
2160 if ( usedDef )
2161 return wxCONV_FAILED;
2162 }
2163 else // we must resort to double tripping...
2164 {
2165 wxWCharBuffer wcBuf(n);
2166 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2167 wcscmp(wcBuf, pwz) != 0 )
2168 {
2169 // we didn't obtain the same thing we started from, hence
2170 // the conversion was lossy and we consider that it failed
2171 return wxCONV_FAILED;
2172 }
2173 }
2174 }
2175
2176 // see the comment above for the reason of "len - 1"
2177 return len - 1;
2178 }
2179
2180 virtual size_t GetMBNulLen() const
2181 {
2182 if ( m_minMBCharWidth == 0 )
2183 {
2184 int len = ::WideCharToMultiByte
2185 (
2186 m_CodePage, // code page
2187 0, // no flags
2188 L"", // input string
2189 1, // translate just the NUL
2190 NULL, // output buffer
2191 0, // and its size
2192 NULL, // no replacement char
2193 NULL // [out] don't care if it was used
2194 );
2195
2196 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2197 switch ( len )
2198 {
2199 default:
2200 wxLogDebug(_T("Unexpected NUL length %d"), len);
2201 self->m_minMBCharWidth = (size_t)-1;
2202 break;
2203
2204 case 0:
2205 self->m_minMBCharWidth = (size_t)-1;
2206 break;
2207
2208 case 1:
2209 case 2:
2210 case 4:
2211 self->m_minMBCharWidth = len;
2212 break;
2213 }
2214 }
2215
2216 return m_minMBCharWidth;
2217 }
2218
2219 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2220
2221 bool IsOk() const { return m_CodePage != -1; }
2222
2223 private:
2224 static bool CanUseNoBestFit()
2225 {
2226 static int s_isWin98Or2k = -1;
2227
2228 if ( s_isWin98Or2k == -1 )
2229 {
2230 int verMaj, verMin;
2231 switch ( wxGetOsVersion(&verMaj, &verMin) )
2232 {
2233 case wxWIN95:
2234 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2235 break;
2236
2237 case wxWINDOWS_NT:
2238 s_isWin98Or2k = verMaj >= 5;
2239 break;
2240
2241 default:
2242 // unknown: be conservative by default
2243 s_isWin98Or2k = 0;
2244 break;
2245 }
2246
2247 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2248 }
2249
2250 return s_isWin98Or2k == 1;
2251 }
2252
2253 static bool IsAtLeastWin2kSP4()
2254 {
2255 #ifdef __WXWINCE__
2256 return false;
2257 #else
2258 static int s_isAtLeastWin2kSP4 = -1;
2259
2260 if ( s_isAtLeastWin2kSP4 == -1 )
2261 {
2262 OSVERSIONINFOEX ver;
2263
2264 memset(&ver, 0, sizeof(ver));
2265 ver.dwOSVersionInfoSize = sizeof(ver);
2266 GetVersionEx((OSVERSIONINFO*)&ver);
2267
2268 s_isAtLeastWin2kSP4 =
2269 ((ver.dwMajorVersion > 5) || // Vista+
2270 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2271 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2272 ver.wServicePackMajor >= 4)) // 2000 SP4+
2273 ? 1 : 0;
2274 }
2275
2276 return s_isAtLeastWin2kSP4 == 1;
2277 #endif
2278 }
2279
2280
2281 // the code page we're working with
2282 long m_CodePage;
2283
2284 // cached result of GetMBNulLen(), set to 0 initially meaning
2285 // "unknown"
2286 size_t m_minMBCharWidth;
2287 };
2288
2289 #endif // wxHAVE_WIN32_MB2WC
2290
2291 // ============================================================================
2292 // Cocoa conversion classes
2293 // ============================================================================
2294
2295 #if defined(__WXCOCOA__)
2296
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a
// bit - it's just not public (yet).
2300
2301 #include <CoreFoundation/CFString.h>
2302 #include <CoreFoundation/CFStringEncodingExt.h>
2303
2304 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2305 {
2306 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2307
2308 switch (encoding)
2309 {
2310 case wxFONTENCODING_DEFAULT :
2311 enc = CFStringGetSystemEncoding();
2312 break ;
2313
2314 case wxFONTENCODING_ISO8859_1 :
2315 enc = kCFStringEncodingISOLatin1 ;
2316 break ;
2317 case wxFONTENCODING_ISO8859_2 :
2318 enc = kCFStringEncodingISOLatin2;
2319 break ;
2320 case wxFONTENCODING_ISO8859_3 :
2321 enc = kCFStringEncodingISOLatin3 ;
2322 break ;
2323 case wxFONTENCODING_ISO8859_4 :
2324 enc = kCFStringEncodingISOLatin4;
2325 break ;
2326 case wxFONTENCODING_ISO8859_5 :
2327 enc = kCFStringEncodingISOLatinCyrillic;
2328 break ;
2329 case wxFONTENCODING_ISO8859_6 :
2330 enc = kCFStringEncodingISOLatinArabic;
2331 break ;
2332 case wxFONTENCODING_ISO8859_7 :
2333 enc = kCFStringEncodingISOLatinGreek;
2334 break ;
2335 case wxFONTENCODING_ISO8859_8 :
2336 enc = kCFStringEncodingISOLatinHebrew;
2337 break ;
2338 case wxFONTENCODING_ISO8859_9 :
2339 enc = kCFStringEncodingISOLatin5;
2340 break ;
2341 case wxFONTENCODING_ISO8859_10 :
2342 enc = kCFStringEncodingISOLatin6;
2343 break ;
2344 case wxFONTENCODING_ISO8859_11 :
2345 enc = kCFStringEncodingISOLatinThai;
2346 break ;
2347 case wxFONTENCODING_ISO8859_13 :
2348 enc = kCFStringEncodingISOLatin7;
2349 break ;
2350 case wxFONTENCODING_ISO8859_14 :
2351 enc = kCFStringEncodingISOLatin8;
2352 break ;
2353 case wxFONTENCODING_ISO8859_15 :
2354 enc = kCFStringEncodingISOLatin9;
2355 break ;
2356
2357 case wxFONTENCODING_KOI8 :
2358 enc = kCFStringEncodingKOI8_R;
2359 break ;
2360 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2361 enc = kCFStringEncodingDOSRussian;
2362 break ;
2363
2364 // case wxFONTENCODING_BULGARIAN :
2365 // enc = ;
2366 // break ;
2367
2368 case wxFONTENCODING_CP437 :
2369 enc = kCFStringEncodingDOSLatinUS ;
2370 break ;
2371 case wxFONTENCODING_CP850 :
2372 enc = kCFStringEncodingDOSLatin1;
2373 break ;
2374 case wxFONTENCODING_CP852 :
2375 enc = kCFStringEncodingDOSLatin2;
2376 break ;
2377 case wxFONTENCODING_CP855 :
2378 enc = kCFStringEncodingDOSCyrillic;
2379 break ;
2380 case wxFONTENCODING_CP866 :
2381 enc = kCFStringEncodingDOSRussian ;
2382 break ;
2383 case wxFONTENCODING_CP874 :
2384 enc = kCFStringEncodingDOSThai;
2385 break ;
2386 case wxFONTENCODING_CP932 :
2387 enc = kCFStringEncodingDOSJapanese;
2388 break ;
2389 case wxFONTENCODING_CP936 :
2390 enc = kCFStringEncodingDOSChineseSimplif ;
2391 break ;
2392 case wxFONTENCODING_CP949 :
2393 enc = kCFStringEncodingDOSKorean;
2394 break ;
2395 case wxFONTENCODING_CP950 :
2396 enc = kCFStringEncodingDOSChineseTrad;
2397 break ;
2398 case wxFONTENCODING_CP1250 :
2399 enc = kCFStringEncodingWindowsLatin2;
2400 break ;
2401 case wxFONTENCODING_CP1251 :
2402 enc = kCFStringEncodingWindowsCyrillic ;
2403 break ;
2404 case wxFONTENCODING_CP1252 :
2405 enc = kCFStringEncodingWindowsLatin1 ;
2406 break ;
2407 case wxFONTENCODING_CP1253 :
2408 enc = kCFStringEncodingWindowsGreek;
2409 break ;
2410 case wxFONTENCODING_CP1254 :
2411 enc = kCFStringEncodingWindowsLatin5;
2412 break ;
2413 case wxFONTENCODING_CP1255 :
2414 enc = kCFStringEncodingWindowsHebrew ;
2415 break ;
2416 case wxFONTENCODING_CP1256 :
2417 enc = kCFStringEncodingWindowsArabic ;
2418 break ;
2419 case wxFONTENCODING_CP1257 :
2420 enc = kCFStringEncodingWindowsBalticRim;
2421 break ;
2422 // This only really encodes to UTF7 (if that) evidently
2423 // case wxFONTENCODING_UTF7 :
2424 // enc = kCFStringEncodingNonLossyASCII ;
2425 // break ;
2426 case wxFONTENCODING_UTF8 :
2427 enc = kCFStringEncodingUTF8 ;
2428 break ;
2429 case wxFONTENCODING_EUC_JP :
2430 enc = kCFStringEncodingEUC_JP;
2431 break ;
2432 case wxFONTENCODING_UTF16 :
2433 enc = kCFStringEncodingUnicode ;
2434 break ;
2435 case wxFONTENCODING_MACROMAN :
2436 enc = kCFStringEncodingMacRoman ;
2437 break ;
2438 case wxFONTENCODING_MACJAPANESE :
2439 enc = kCFStringEncodingMacJapanese ;
2440 break ;
2441 case wxFONTENCODING_MACCHINESETRAD :
2442 enc = kCFStringEncodingMacChineseTrad ;
2443 break ;
2444 case wxFONTENCODING_MACKOREAN :
2445 enc = kCFStringEncodingMacKorean ;
2446 break ;
2447 case wxFONTENCODING_MACARABIC :
2448 enc = kCFStringEncodingMacArabic ;
2449 break ;
2450 case wxFONTENCODING_MACHEBREW :
2451 enc = kCFStringEncodingMacHebrew ;
2452 break ;
2453 case wxFONTENCODING_MACGREEK :
2454 enc = kCFStringEncodingMacGreek ;
2455 break ;
2456 case wxFONTENCODING_MACCYRILLIC :
2457 enc = kCFStringEncodingMacCyrillic ;
2458 break ;
2459 case wxFONTENCODING_MACDEVANAGARI :
2460 enc = kCFStringEncodingMacDevanagari ;
2461 break ;
2462 case wxFONTENCODING_MACGURMUKHI :
2463 enc = kCFStringEncodingMacGurmukhi ;
2464 break ;
2465 case wxFONTENCODING_MACGUJARATI :
2466 enc = kCFStringEncodingMacGujarati ;
2467 break ;
2468 case wxFONTENCODING_MACORIYA :
2469 enc = kCFStringEncodingMacOriya ;
2470 break ;
2471 case wxFONTENCODING_MACBENGALI :
2472 enc = kCFStringEncodingMacBengali ;
2473 break ;
2474 case wxFONTENCODING_MACTAMIL :
2475 enc = kCFStringEncodingMacTamil ;
2476 break ;
2477 case wxFONTENCODING_MACTELUGU :
2478 enc = kCFStringEncodingMacTelugu ;
2479 break ;
2480 case wxFONTENCODING_MACKANNADA :
2481 enc = kCFStringEncodingMacKannada ;
2482 break ;
2483 case wxFONTENCODING_MACMALAJALAM :
2484 enc = kCFStringEncodingMacMalayalam ;
2485 break ;
2486 case wxFONTENCODING_MACSINHALESE :
2487 enc = kCFStringEncodingMacSinhalese ;
2488 break ;
2489 case wxFONTENCODING_MACBURMESE :
2490 enc = kCFStringEncodingMacBurmese ;
2491 break ;
2492 case wxFONTENCODING_MACKHMER :
2493 enc = kCFStringEncodingMacKhmer ;
2494 break ;
2495 case wxFONTENCODING_MACTHAI :
2496 enc = kCFStringEncodingMacThai ;
2497 break ;
2498 case wxFONTENCODING_MACLAOTIAN :
2499 enc = kCFStringEncodingMacLaotian ;
2500 break ;
2501 case wxFONTENCODING_MACGEORGIAN :
2502 enc = kCFStringEncodingMacGeorgian ;
2503 break ;
2504 case wxFONTENCODING_MACARMENIAN :
2505 enc = kCFStringEncodingMacArmenian ;
2506 break ;
2507 case wxFONTENCODING_MACCHINESESIMP :
2508 enc = kCFStringEncodingMacChineseSimp ;
2509 break ;
2510 case wxFONTENCODING_MACTIBETAN :
2511 enc = kCFStringEncodingMacTibetan ;
2512 break ;
2513 case wxFONTENCODING_MACMONGOLIAN :
2514 enc = kCFStringEncodingMacMongolian ;
2515 break ;
2516 case wxFONTENCODING_MACETHIOPIC :
2517 enc = kCFStringEncodingMacEthiopic ;
2518 break ;
2519 case wxFONTENCODING_MACCENTRALEUR :
2520 enc = kCFStringEncodingMacCentralEurRoman ;
2521 break ;
2522 case wxFONTENCODING_MACVIATNAMESE :
2523 enc = kCFStringEncodingMacVietnamese ;
2524 break ;
2525 case wxFONTENCODING_MACARABICEXT :
2526 enc = kCFStringEncodingMacExtArabic ;
2527 break ;
2528 case wxFONTENCODING_MACSYMBOL :
2529 enc = kCFStringEncodingMacSymbol ;
2530 break ;
2531 case wxFONTENCODING_MACDINGBATS :
2532 enc = kCFStringEncodingMacDingbats ;
2533 break ;
2534 case wxFONTENCODING_MACTURKISH :
2535 enc = kCFStringEncodingMacTurkish ;
2536 break ;
2537 case wxFONTENCODING_MACCROATIAN :
2538 enc = kCFStringEncodingMacCroatian ;
2539 break ;
2540 case wxFONTENCODING_MACICELANDIC :
2541 enc = kCFStringEncodingMacIcelandic ;
2542 break ;
2543 case wxFONTENCODING_MACROMANIAN :
2544 enc = kCFStringEncodingMacRomanian ;
2545 break ;
2546 case wxFONTENCODING_MACCELTIC :
2547 enc = kCFStringEncodingMacCeltic ;
2548 break ;
2549 case wxFONTENCODING_MACGAELIC :
2550 enc = kCFStringEncodingMacGaelic ;
2551 break ;
2552 // case wxFONTENCODING_MACKEYBOARD :
2553 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2554 // break ;
2555
2556 default :
2557 // because gcc is picky
2558 break ;
2559 }
2560
2561 return enc ;
2562 }
2563
2564 class wxMBConv_cocoa : public wxMBConv
2565 {
2566 public:
2567 wxMBConv_cocoa()
2568 {
2569 Init(CFStringGetSystemEncoding()) ;
2570 }
2571
2572 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2573 {
2574 m_encoding = conv.m_encoding;
2575 }
2576
2577 #if wxUSE_FONTMAP
2578 wxMBConv_cocoa(const wxChar* name)
2579 {
2580 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2581 }
2582 #endif
2583
2584 wxMBConv_cocoa(wxFontEncoding encoding)
2585 {
2586 Init( wxCFStringEncFromFontEnc(encoding) );
2587 }
2588
2589 ~wxMBConv_cocoa()
2590 {
2591 }
2592
2593 void Init( CFStringEncoding encoding)
2594 {
2595 m_encoding = encoding ;
2596 }
2597
2598 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2599 {
2600 wxASSERT(szUnConv);
2601
2602 CFStringRef theString = CFStringCreateWithBytes (
2603 NULL, //the allocator
2604 (const UInt8*)szUnConv,
2605 strlen(szUnConv),
2606 m_encoding,
2607 false //no BOM/external representation
2608 );
2609
2610 wxASSERT(theString);
2611
2612 size_t nOutLength = CFStringGetLength(theString);
2613
2614 if (szOut == NULL)
2615 {
2616 CFRelease(theString);
2617 return nOutLength;
2618 }
2619
2620 CFRange theRange = { 0, nOutSize };
2621
2622 #if SIZEOF_WCHAR_T == 4
2623 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2624 #endif
2625
2626 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2627
2628 CFRelease(theString);
2629
2630 szUniCharBuffer[nOutLength] = '\0';
2631
2632 #if SIZEOF_WCHAR_T == 4
2633 wxMBConvUTF16 converter;
2634 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2635 delete [] szUniCharBuffer;
2636 #endif
2637
2638 return nOutLength;
2639 }
2640
2641 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2642 {
2643 wxASSERT(szUnConv);
2644
2645 size_t nRealOutSize;
2646 size_t nBufSize = wxWcslen(szUnConv);
2647 UniChar* szUniBuffer = (UniChar*) szUnConv;
2648
2649 #if SIZEOF_WCHAR_T == 4
2650 wxMBConvUTF16 converter ;
2651 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2652 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2653 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2654 nBufSize /= sizeof(UniChar);
2655 #endif
2656
2657 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2658 NULL, //allocator
2659 szUniBuffer,
2660 nBufSize,
2661 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2662 );
2663
2664 wxASSERT(theString);
2665
2666 //Note that CER puts a BOM when converting to unicode
2667 //so we check and use getchars instead in that case
2668 if (m_encoding == kCFStringEncodingUnicode)
2669 {
2670 if (szOut != NULL)
2671 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2672
2673 nRealOutSize = CFStringGetLength(theString) + 1;
2674 }
2675 else
2676 {
2677 CFStringGetBytes(
2678 theString,
2679 CFRangeMake(0, CFStringGetLength(theString)),
2680 m_encoding,
2681 0, //what to put in characters that can't be converted -
2682 //0 tells CFString to return NULL if it meets such a character
2683 false, //not an external representation
2684 (UInt8*) szOut,
2685 nOutSize,
2686 (CFIndex*) &nRealOutSize
2687 );
2688 }
2689
2690 CFRelease(theString);
2691
2692 #if SIZEOF_WCHAR_T == 4
2693 delete[] szUniBuffer;
2694 #endif
2695
2696 return nRealOutSize - 1;
2697 }
2698
2699 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2700
2701 bool IsOk() const
2702 {
2703 return m_encoding != kCFStringEncodingInvalidId &&
2704 CFStringIsEncodingAvailable(m_encoding);
2705 }
2706
2707 private:
2708 CFStringEncoding m_encoding ;
2709 };
2710
2711 #endif // defined(__WXCOCOA__)
2712
2713 // ============================================================================
2714 // Mac conversion classes
2715 // ============================================================================
2716
2717 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2718
2719 class wxMBConv_mac : public wxMBConv
2720 {
2721 public:
2722 wxMBConv_mac()
2723 {
2724 Init(CFStringGetSystemEncoding()) ;
2725 }
2726
2727 wxMBConv_mac(const wxMBConv_mac& conv)
2728 {
2729 Init(conv.m_char_encoding);
2730 }
2731
2732 #if wxUSE_FONTMAP
2733 wxMBConv_mac(const wxChar* name)
2734 {
2735 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2736 }
2737 #endif
2738
2739 wxMBConv_mac(wxFontEncoding encoding)
2740 {
2741 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2742 }
2743
2744 ~wxMBConv_mac()
2745 {
2746 OSStatus status = noErr ;
2747 status = TECDisposeConverter(m_MB2WC_converter);
2748 status = TECDisposeConverter(m_WC2MB_converter);
2749 }
2750
2751
2752 void Init( TextEncodingBase encoding)
2753 {
2754 OSStatus status = noErr ;
2755 m_char_encoding = encoding ;
2756 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2757
2758 status = TECCreateConverter(&m_MB2WC_converter,
2759 m_char_encoding,
2760 m_unicode_encoding);
2761 status = TECCreateConverter(&m_WC2MB_converter,
2762 m_unicode_encoding,
2763 m_char_encoding);
2764 }
2765
2766 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2767 {
2768 OSStatus status = noErr ;
2769 ByteCount byteOutLen ;
2770 ByteCount byteInLen = strlen(psz) + 1;
2771 wchar_t *tbuf = NULL ;
2772 UniChar* ubuf = NULL ;
2773 size_t res = 0 ;
2774
2775 if (buf == NULL)
2776 {
2777 // Apple specs say at least 32
2778 n = wxMax( 32, byteInLen ) ;
2779 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2780 }
2781
2782 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2783
2784 #if SIZEOF_WCHAR_T == 4
2785 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2786 #else
2787 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2788 #endif
2789
2790 status = TECConvertText(
2791 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2792 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2793
2794 #if SIZEOF_WCHAR_T == 4
2795 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2796 // is not properly terminated we get random characters at the end
2797 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2798 wxMBConvUTF16 converter ;
2799 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2800 free( ubuf ) ;
2801 #else
2802 res = byteOutLen / sizeof( UniChar ) ;
2803 #endif
2804
2805 if ( buf == NULL )
2806 free(tbuf) ;
2807
2808 if ( buf && res < n)
2809 buf[res] = 0;
2810
2811 return res ;
2812 }
2813
2814 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2815 {
2816 OSStatus status = noErr ;
2817 ByteCount byteOutLen ;
2818 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2819
2820 char *tbuf = NULL ;
2821
2822 if (buf == NULL)
2823 {
2824 // Apple specs say at least 32
2825 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2826 tbuf = (char*) malloc( n ) ;
2827 }
2828
2829 ByteCount byteBufferLen = n ;
2830 UniChar* ubuf = NULL ;
2831
2832 #if SIZEOF_WCHAR_T == 4
2833 wxMBConvUTF16 converter ;
2834 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2835 byteInLen = unicharlen ;
2836 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2837 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2838 #else
2839 ubuf = (UniChar*) psz ;
2840 #endif
2841
2842 status = TECConvertText(
2843 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2844 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2845
2846 #if SIZEOF_WCHAR_T == 4
2847 free( ubuf ) ;
2848 #endif
2849
2850 if ( buf == NULL )
2851 free(tbuf) ;
2852
2853 size_t res = byteOutLen ;
2854 if ( buf && res < n)
2855 {
2856 buf[res] = 0;
2857
2858 //we need to double-trip to verify it didn't insert any ? in place
2859 //of bogus characters
2860 wxWCharBuffer wcBuf(n);
2861 size_t pszlen = wxWcslen(psz);
2862 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2863 wxWcslen(wcBuf) != pszlen ||
2864 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2865 {
2866 // we didn't obtain the same thing we started from, hence
2867 // the conversion was lossy and we consider that it failed
2868 return wxCONV_FAILED;
2869 }
2870 }
2871
2872 return res ;
2873 }
2874
2875 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2876
2877 bool IsOk() const
2878 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2879
2880 private:
2881 TECObjectRef m_MB2WC_converter;
2882 TECObjectRef m_WC2MB_converter;
2883
2884 TextEncodingBase m_char_encoding;
2885 TextEncodingBase m_unicode_encoding;
2886 };
2887
2888 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2889
2890 // ============================================================================
2891 // wxEncodingConverter based conversion classes
2892 // ============================================================================
2893
2894 #if wxUSE_FONTMAP
2895
2896 class wxMBConv_wxwin : public wxMBConv
2897 {
2898 private:
2899 void Init()
2900 {
2901 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2902 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2903 }
2904
2905 public:
2906 // temporarily just use wxEncodingConverter stuff,
2907 // so that it works while a better implementation is built
2908 wxMBConv_wxwin(const wxChar* name)
2909 {
2910 if (name)
2911 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2912 else
2913 m_enc = wxFONTENCODING_SYSTEM;
2914
2915 Init();
2916 }
2917
2918 wxMBConv_wxwin(wxFontEncoding enc)
2919 {
2920 m_enc = enc;
2921
2922 Init();
2923 }
2924
2925 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2926 {
2927 size_t inbuf = strlen(psz);
2928 if (buf)
2929 {
2930 if (!m2w.Convert(psz, buf))
2931 return wxCONV_FAILED;
2932 }
2933 return inbuf;
2934 }
2935
2936 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2937 {
2938 const size_t inbuf = wxWcslen(psz);
2939 if (buf)
2940 {
2941 if (!w2m.Convert(psz, buf))
2942 return wxCONV_FAILED;
2943 }
2944
2945 return inbuf;
2946 }
2947
2948 virtual size_t GetMBNulLen() const
2949 {
2950 switch ( m_enc )
2951 {
2952 case wxFONTENCODING_UTF16BE:
2953 case wxFONTENCODING_UTF16LE:
2954 return 2;
2955
2956 case wxFONTENCODING_UTF32BE:
2957 case wxFONTENCODING_UTF32LE:
2958 return 4;
2959
2960 default:
2961 return 1;
2962 }
2963 }
2964
2965 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2966
2967 bool IsOk() const { return m_ok; }
2968
2969 public:
2970 wxFontEncoding m_enc;
2971 wxEncodingConverter m2w, w2m;
2972
2973 private:
2974 // were we initialized successfully?
2975 bool m_ok;
2976
2977 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2978 };
2979
2980 // make the constructors available for unit testing
2981 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2982 {
2983 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2984 if ( !result->IsOk() )
2985 {
2986 delete result;
2987 return 0;
2988 }
2989
2990 return result;
2991 }
2992
2993 #endif // wxUSE_FONTMAP
2994
2995 // ============================================================================
2996 // wxCSConv implementation
2997 // ============================================================================
2998
2999 void wxCSConv::Init()
3000 {
3001 m_name = NULL;
3002 m_convReal = NULL;
3003 m_deferred = true;
3004 }
3005
3006 wxCSConv::wxCSConv(const wxChar *charset)
3007 {
3008 Init();
3009
3010 if ( charset )
3011 {
3012 SetName(charset);
3013 }
3014
3015 #if wxUSE_FONTMAP
3016 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3017 #else
3018 m_encoding = wxFONTENCODING_SYSTEM;
3019 #endif
3020 }
3021
3022 wxCSConv::wxCSConv(wxFontEncoding encoding)
3023 {
3024 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3025 {
3026 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3027
3028 encoding = wxFONTENCODING_SYSTEM;
3029 }
3030
3031 Init();
3032
3033 m_encoding = encoding;
3034 }
3035
3036 wxCSConv::~wxCSConv()
3037 {
3038 Clear();
3039 }
3040
3041 wxCSConv::wxCSConv(const wxCSConv& conv)
3042 : wxMBConv()
3043 {
3044 Init();
3045
3046 SetName(conv.m_name);
3047 m_encoding = conv.m_encoding;
3048 }
3049
3050 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3051 {
3052 Clear();
3053
3054 SetName(conv.m_name);
3055 m_encoding = conv.m_encoding;
3056
3057 return *this;
3058 }
3059
3060 void wxCSConv::Clear()
3061 {
3062 free(m_name);
3063 delete m_convReal;
3064
3065 m_name = NULL;
3066 m_convReal = NULL;
3067 }
3068
3069 void wxCSConv::SetName(const wxChar *charset)
3070 {
3071 if (charset)
3072 {
3073 m_name = wxStrdup(charset);
3074 m_deferred = true;
3075 }
3076 }
3077
#if wxUSE_FONTMAP

#include "wx/hashmap.h"

// map from a font encoding to a charset name
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual, wxEncodingNameCache );

// used by wxCSConv::DoCreate() to remember which charset name was accepted
// by iconv for an encoding; an empty name records that none of them was
static wxEncodingNameCache gs_nameCache;

#endif // wxUSE_FONTMAP
3086
3087 wxMBConv *wxCSConv::DoCreate() const
3088 {
3089 #if wxUSE_FONTMAP
3090 wxLogTrace(TRACE_STRCONV,
3091 wxT("creating conversion for %s"),
3092 (m_name ? m_name
3093 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3094 #endif // wxUSE_FONTMAP
3095
3096 // check for the special case of ASCII or ISO8859-1 charset: as we have
3097 // special knowledge of it anyhow, we don't need to create a special
3098 // conversion object
3099 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3100 m_encoding == wxFONTENCODING_DEFAULT )
3101 {
3102 // don't convert at all
3103 return NULL;
3104 }
3105
3106 // we trust OS to do conversion better than we can so try external
3107 // conversion methods first
3108 //
3109 // the full order is:
3110 // 1. OS conversion (iconv() under Unix or Win32 API)
3111 // 2. hard coded conversions for UTF
3112 // 3. wxEncodingConverter as fall back
3113
3114 // step (1)
3115 #ifdef HAVE_ICONV
3116 #if !wxUSE_FONTMAP
3117 if ( m_name )
3118 #endif // !wxUSE_FONTMAP
3119 {
3120 wxString name(m_name);
3121 wxFontEncoding encoding(m_encoding);
3122
3123 if ( !name.empty() )
3124 {
3125 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3126 if ( conv->IsOk() )
3127 return conv;
3128
3129 delete conv;
3130
3131 #if wxUSE_FONTMAP
3132 encoding =
3133 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3134 #endif // wxUSE_FONTMAP
3135 }
3136 #if wxUSE_FONTMAP
3137 {
3138 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3139 if ( it != gs_nameCache.end() )
3140 {
3141 if ( it->second.empty() )
3142 return NULL;
3143
3144 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3145 if ( conv->IsOk() )
3146 return conv;
3147
3148 delete conv;
3149 }
3150
3151 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3152
3153 for ( ; *names; ++names )
3154 {
3155 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3156 if ( conv->IsOk() )
3157 {
3158 gs_nameCache[encoding] = *names;
3159 return conv;
3160 }
3161
3162 delete conv;
3163 }
3164
3165 gs_nameCache[encoding] = _T(""); // cache the failure
3166 }
3167 #endif // wxUSE_FONTMAP
3168 }
3169 #endif // HAVE_ICONV
3170
3171 #ifdef wxHAVE_WIN32_MB2WC
3172 {
3173 #if wxUSE_FONTMAP
3174 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3175 : new wxMBConv_win32(m_encoding);
3176 if ( conv->IsOk() )
3177 return conv;
3178
3179 delete conv;
3180 #else
3181 return NULL;
3182 #endif
3183 }
3184 #endif // wxHAVE_WIN32_MB2WC
3185
3186 #if defined(__WXMAC__)
3187 {
3188 // leave UTF16 and UTF32 to the built-ins of wx
3189 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3190 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3191 {
3192 #if wxUSE_FONTMAP
3193 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3194 : new wxMBConv_mac(m_encoding);
3195 #else
3196 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3197 #endif
3198 if ( conv->IsOk() )
3199 return conv;
3200
3201 delete conv;
3202 }
3203 }
3204 #endif
3205
3206 #if defined(__WXCOCOA__)
3207 {
3208 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3209 {
3210 #if wxUSE_FONTMAP
3211 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3212 : new wxMBConv_cocoa(m_encoding);
3213 #else
3214 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3215 #endif
3216
3217 if ( conv->IsOk() )
3218 return conv;
3219
3220 delete conv;
3221 }
3222 }
3223 #endif
3224 // step (2)
3225 wxFontEncoding enc = m_encoding;
3226 #if wxUSE_FONTMAP
3227 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3228 {
3229 // use "false" to suppress interactive dialogs -- we can be called from
3230 // anywhere and popping up a dialog from here is the last thing we want to
3231 // do
3232 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3233 }
3234 #endif // wxUSE_FONTMAP
3235
3236 switch ( enc )
3237 {
3238 case wxFONTENCODING_UTF7:
3239 return new wxMBConvUTF7;
3240
3241 case wxFONTENCODING_UTF8:
3242 return new wxMBConvUTF8;
3243
3244 case wxFONTENCODING_UTF16BE:
3245 return new wxMBConvUTF16BE;
3246
3247 case wxFONTENCODING_UTF16LE:
3248 return new wxMBConvUTF16LE;
3249
3250 case wxFONTENCODING_UTF32BE:
3251 return new wxMBConvUTF32BE;
3252
3253 case wxFONTENCODING_UTF32LE:
3254 return new wxMBConvUTF32LE;
3255
3256 default:
3257 // nothing to do but put here to suppress gcc warnings
3258 break;
3259 }
3260
3261 // step (3)
3262 #if wxUSE_FONTMAP
3263 {
3264 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3265 : new wxMBConv_wxwin(m_encoding);
3266 if ( conv->IsOk() )
3267 return conv;
3268
3269 delete conv;
3270 }
3271 #endif // wxUSE_FONTMAP
3272
3273 // NB: This is a hack to prevent deadlock. What could otherwise happen
3274 // in Unicode build: wxConvLocal creation ends up being here
3275 // because of some failure and logs the error. But wxLog will try to
3276 // attach timestamp, for which it will need wxConvLocal (to convert
3277 // time to char* and then wchar_t*), but that fails, tries to log
3278 // error, but wxLog has a (already locked) critical section that
3279 // guards static buffer.
3280 static bool alreadyLoggingError = false;
3281 if (!alreadyLoggingError)
3282 {
3283 alreadyLoggingError = true;
3284 wxLogError(_("Cannot convert from the charset '%s'!"),
3285 m_name ? m_name
3286 :
3287 #if wxUSE_FONTMAP
3288 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3289 #else // !wxUSE_FONTMAP
3290 wxString::Format(_("encoding %s"), m_encoding).c_str()
3291 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3292 );
3293
3294 alreadyLoggingError = false;
3295 }
3296
3297 return NULL;
3298 }
3299
3300 void wxCSConv::CreateConvIfNeeded() const
3301 {
3302 if ( m_deferred )
3303 {
3304 wxCSConv *self = (wxCSConv *)this; // const_cast
3305
3306 #if wxUSE_INTL
3307 // if we don't have neither the name nor the encoding, use the default
3308 // encoding for this system
3309 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3310 {
3311 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3312 }
3313 #endif // wxUSE_INTL
3314
3315 self->m_convReal = DoCreate();
3316 self->m_deferred = false;
3317 }
3318 }
3319
3320 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3321 {
3322 CreateConvIfNeeded();
3323
3324 if (m_convReal)
3325 return m_convReal->MB2WC(buf, psz, n);
3326
3327 // latin-1 (direct)
3328 size_t len = strlen(psz);
3329
3330 if (buf)
3331 {
3332 for (size_t c = 0; c <= len; c++)
3333 buf[c] = (unsigned char)(psz[c]);
3334 }
3335
3336 return len;
3337 }
3338
3339 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3340 {
3341 CreateConvIfNeeded();
3342
3343 if (m_convReal)
3344 return m_convReal->WC2MB(buf, psz, n);
3345
3346 // latin-1 (direct)
3347 const size_t len = wxWcslen(psz);
3348 if (buf)
3349 {
3350 for (size_t c = 0; c <= len; c++)
3351 {
3352 if (psz[c] > 0xFF)
3353 return wxCONV_FAILED;
3354
3355 buf[c] = (char)psz[c];
3356 }
3357 }
3358 else
3359 {
3360 for (size_t c = 0; c <= len; c++)
3361 {
3362 if (psz[c] > 0xFF)
3363 return wxCONV_FAILED;
3364 }
3365 }
3366
3367 return len;
3368 }
3369
3370 size_t wxCSConv::GetMBNulLen() const
3371 {
3372 CreateConvIfNeeded();
3373
3374 if ( m_convReal )
3375 {
3376 return m_convReal->GetMBNulLen();
3377 }
3378
3379 return 1;
3380 }
3381
3382 // ----------------------------------------------------------------------------
3383 // globals
3384 // ----------------------------------------------------------------------------
3385
3386 #ifdef __WINDOWS__
3387 static wxMBConv_win32 wxConvLibcObj;
3388 #elif defined(__WXMAC__) && !defined(__MACH__)
3389 static wxMBConv_mac wxConvLibcObj ;
3390 #else
3391 static wxMBConvLibc wxConvLibcObj;
3392 #endif
3393
3394 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3395 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3396 static wxMBConvUTF7 wxConvUTF7Obj;
3397 static wxMBConvUTF8 wxConvUTF8Obj;
3398
3399 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3400 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3401 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3402 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3403 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3404 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3405 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
3406 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3407 #ifdef __WXOSX__
3408 wxConvUTF8Obj;
3409 #else
3410 wxConvLibcObj;
3411 #endif
3412
3413 #else // !wxUSE_WCHAR_T
3414
3415 // stand-ins in absence of wchar_t
3416 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3417 wxConvISO8859_1,
3418 wxConvLocal,
3419 wxConvUTF8;
3420
3421 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T