]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
docopydocs is not needed for this script.
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
17
18 #ifndef WX_PRECOMP
19 #include "wx/intl.h"
20 #include "wx/log.h"
21 #include "wx/utils.h"
22 #endif
23
24 #include "wx/strconv.h"
25
26 #if wxUSE_WCHAR_T
27
28 #ifdef __WINDOWS__
29 #include "wx/msw/private.h"
30 #include "wx/msw/missing.h"
31 #endif
32
33 #ifndef __WXWINCE__
34 #include <errno.h>
35 #endif
36
37 #include <ctype.h>
38 #include <string.h>
39 #include <stdlib.h>
40
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #define wxHAVE_WIN32_MB2WC
43 #endif
44
45 #ifdef __SALFORDC__
46 #include <clib.h>
47 #endif
48
49 #ifdef HAVE_ICONV
50 #include <iconv.h>
51 #include "wx/thread.h"
52 #endif
53
54 #include "wx/encconv.h"
55 #include "wx/fontmap.h"
56
57 #ifdef __WXMAC__
58 #ifndef __DARWIN__
59 #include <ATSUnicode.h>
60 #include <TextCommon.h>
61 #include <TextEncodingConverter.h>
62 #endif
63
64 // includes Mac headers
65 #include "wx/mac/private.h"
66 #endif
67
68
69 #define TRACE_STRCONV _T("strconv")
70
71 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
72 // be 4 bytes
73 #if SIZEOF_WCHAR_T == 2
74 #define WC_UTF16
75 #endif
76
77
78 // ============================================================================
79 // implementation
80 // ============================================================================
81
// helper for the conversion functions below: returns true if at least one of
// the n bytes starting at p is not NUL and false if all of them are NUL (which
// is trivially the case when n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
90
91 // ----------------------------------------------------------------------------
92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
93 // ----------------------------------------------------------------------------
94
95 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
96 {
97 if (input <= 0xffff)
98 {
99 if (output)
100 *output = (wxUint16) input;
101
102 return 1;
103 }
104 else if (input >= 0x110000)
105 {
106 return wxCONV_FAILED;
107 }
108 else
109 {
110 if (output)
111 {
112 *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
113 *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
114 }
115
116 return 2;
117 }
118 }
119
120 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
121 {
122 if ((*input < 0xd800) || (*input > 0xdfff))
123 {
124 output = *input;
125 return 1;
126 }
127 else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
128 {
129 output = *input;
130 return wxCONV_FAILED;
131 }
132 else
133 {
134 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
135 return 2;
136 }
137 }
138
139 #ifdef WC_UTF16
140 typedef wchar_t wxDecodeSurrogate_t;
141 #else // !WC_UTF16
142 typedef wxUint16 wxDecodeSurrogate_t;
143 #endif // WC_UTF16/!WC_UTF16
144
145 // returns the next UTF-32 character from the wchar_t buffer and advances the
146 // pointer to the character after this one
147 //
148 // if an invalid character is found, *pSrc is set to NULL, the caller must
149 // check for this
150 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
151 {
152 wxUint32 out;
153 const size_t
154 n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
155 if ( n == wxCONV_FAILED )
156 *pSrc = NULL;
157 else
158 *pSrc += n;
159
160 return out;
161 }
162
163 // ----------------------------------------------------------------------------
164 // wxMBConv
165 // ----------------------------------------------------------------------------
166
167 size_t
168 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
169 const char *src, size_t srcLen) const
170 {
171 // although new conversion classes are supposed to implement this function
172 // directly, the existins ones only implement the old MB2WC() and so, to
173 // avoid to have to rewrite all conversion classes at once, we provide a
174 // default (but not efficient) implementation of this one in terms of the
175 // old function by copying the input to ensure that it's NUL-terminated and
176 // then using MB2WC() to convert it
177
178 // the number of chars [which would be] written to dst [if it were not NULL]
179 size_t dstWritten = 0;
180
181 // the number of NULs terminating this string
182 size_t nulLen = 0; // not really needed, but just to avoid warnings
183
184 // if we were not given the input size we just have to assume that the
185 // string is properly terminated as we have no way of knowing how long it
186 // is anyhow, but if we do have the size check whether there are enough
187 // NULs at the end
188 wxCharBuffer bufTmp;
189 const char *srcEnd;
190 if ( srcLen != wxNO_LEN )
191 {
192 // we need to know how to find the end of this string
193 nulLen = GetMBNulLen();
194 if ( nulLen == wxCONV_FAILED )
195 return wxCONV_FAILED;
196
197 // if there are enough NULs we can avoid the copy
198 if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
199 {
200 // make a copy in order to properly NUL-terminate the string
201 bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
202 char * const p = bufTmp.data();
203 memcpy(p, src, srcLen);
204 for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
205 *s = '\0';
206
207 src = bufTmp;
208 }
209
210 srcEnd = src + srcLen;
211 }
212 else // quit after the first loop iteration
213 {
214 srcEnd = NULL;
215 }
216
217 for ( ;; )
218 {
219 // try to convert the current chunk
220 size_t lenChunk = MB2WC(NULL, src, 0);
221 if ( lenChunk == wxCONV_FAILED )
222 return wxCONV_FAILED;
223
224 lenChunk++; // for the L'\0' at the end of this chunk
225
226 dstWritten += lenChunk;
227
228 if ( lenChunk == 1 )
229 {
230 // nothing left in the input string, conversion succeeded
231 break;
232 }
233
234 if ( dst )
235 {
236 if ( dstWritten > dstLen )
237 return wxCONV_FAILED;
238
239 if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
240 return wxCONV_FAILED;
241
242 dst += lenChunk;
243 }
244
245 if ( !srcEnd )
246 {
247 // we convert just one chunk in this case as this is the entire
248 // string anyhow
249 break;
250 }
251
252 // advance the input pointer past the end of this chunk
253 while ( NotAllNULs(src, nulLen) )
254 {
255 // notice that we must skip over multiple bytes here as we suppose
256 // that if NUL takes 2 or 4 bytes, then all the other characters do
257 // too and so if advanced by a single byte we might erroneously
258 // detect sequences of NUL bytes in the middle of the input
259 src += nulLen;
260 }
261
262 src += nulLen; // skipping over its terminator as well
263
264 // note that ">=" (and not just "==") is needed here as the terminator
265 // we skipped just above could be inside or just after the buffer
266 // delimited by inEnd
267 if ( src >= srcEnd )
268 break;
269 }
270
271 return dstWritten;
272 }
273
274 size_t
275 wxMBConv::FromWChar(char *dst, size_t dstLen,
276 const wchar_t *src, size_t srcLen) const
277 {
278 // the number of chars [which would be] written to dst [if it were not NULL]
279 size_t dstWritten = 0;
280
281 // make a copy of the input string unless it is already properly
282 // NUL-terminated
283 //
284 // if we don't know its length we have no choice but to assume that it is,
285 // indeed, properly terminated
286 wxWCharBuffer bufTmp;
287 if ( srcLen == wxNO_LEN )
288 {
289 srcLen = wxWcslen(src) + 1;
290 }
291 else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
292 {
293 // make a copy in order to properly NUL-terminate the string
294 bufTmp = wxWCharBuffer(srcLen);
295 memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
296 src = bufTmp;
297 }
298
299 const size_t lenNul = GetMBNulLen();
300 for ( const wchar_t * const srcEnd = src + srcLen;
301 src < srcEnd;
302 src += wxWcslen(src) + 1 /* skip L'\0' too */ )
303 {
304 // try to convert the current chunk
305 size_t lenChunk = WC2MB(NULL, src, 0);
306
307 if ( lenChunk == wxCONV_FAILED )
308 return wxCONV_FAILED;
309
310 lenChunk += lenNul;
311 dstWritten += lenChunk;
312
313 if ( dst )
314 {
315 if ( dstWritten > dstLen )
316 return wxCONV_FAILED;
317
318 if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
319 return wxCONV_FAILED;
320
321 dst += lenChunk;
322 }
323 }
324
325 return dstWritten;
326 }
327
328 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
329 {
330 size_t rc = ToWChar(outBuff, outLen, inBuff);
331 if ( rc != wxCONV_FAILED )
332 {
333 // ToWChar() returns the buffer length, i.e. including the trailing
334 // NUL, while this method doesn't take it into account
335 rc--;
336 }
337
338 return rc;
339 }
340
341 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
342 {
343 size_t rc = FromWChar(outBuff, outLen, inBuff);
344 if ( rc != wxCONV_FAILED )
345 {
346 rc -= GetMBNulLen();
347 }
348
349 return rc;
350 }
351
352 wxMBConv::~wxMBConv()
353 {
354 // nothing to do here (necessary for Darwin linking probably)
355 }
356
357 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
358 {
359 if ( psz )
360 {
361 // calculate the length of the buffer needed first
362 const size_t nLen = MB2WC(NULL, psz, 0);
363 if ( nLen != wxCONV_FAILED )
364 {
365 // now do the actual conversion
366 wxWCharBuffer buf(nLen /* +1 added implicitly */);
367
368 // +1 for the trailing NULL
369 if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
370 return buf;
371 }
372 }
373
374 return wxWCharBuffer();
375 }
376
377 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
378 {
379 if ( pwz )
380 {
381 const size_t nLen = WC2MB(NULL, pwz, 0);
382 if ( nLen != wxCONV_FAILED )
383 {
384 // extra space for trailing NUL(s)
385 static const size_t extraLen = GetMaxMBNulLen();
386
387 wxCharBuffer buf(nLen + extraLen - 1);
388 if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
389 return buf;
390 }
391 }
392
393 return wxCharBuffer();
394 }
395
396 const wxWCharBuffer
397 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
398 {
399 const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
400 if ( dstLen != wxCONV_FAILED )
401 {
402 wxWCharBuffer wbuf(dstLen - 1);
403 if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
404 {
405 if ( outLen )
406 {
407 *outLen = dstLen;
408 if ( wbuf[dstLen - 1] == L'\0' )
409 (*outLen)--;
410 }
411
412 return wbuf;
413 }
414 }
415
416 if ( outLen )
417 *outLen = 0;
418
419 return wxWCharBuffer();
420 }
421
422 const wxCharBuffer
423 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
424 {
425 const size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
426 if ( dstLen != wxCONV_FAILED )
427 {
428 wxCharBuffer buf(dstLen - 1);
429 if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
430 {
431 if ( outLen )
432 {
433 *outLen = dstLen;
434
435 const size_t nulLen = GetMBNulLen();
436 if ( !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
437 {
438 // in this case the output is NUL-terminated and we're not
439 // supposed to count NUL
440 (*outLen) -= nulLen;
441 }
442 }
443
444 return buf;
445 }
446 }
447
448 if ( outLen )
449 *outLen = 0;
450
451 return wxCharBuffer();
452 }
453
454 // ----------------------------------------------------------------------------
455 // wxMBConvLibc
456 // ----------------------------------------------------------------------------
457
458 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
459 {
460 return wxMB2WC(buf, psz, n);
461 }
462
463 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
464 {
465 return wxWC2MB(buf, psz, n);
466 }
467
468 // ----------------------------------------------------------------------------
469 // wxConvBrokenFileNames
470 // ----------------------------------------------------------------------------
471
472 #ifdef __UNIX__
473
474 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
475 {
476 if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
477 || wxStricmp(charset, _T("UTF8")) == 0 )
478 m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
479 else
480 m_conv = new wxCSConv(charset);
481 }
482
483 #endif // __UNIX__
484
485 // ----------------------------------------------------------------------------
486 // UTF-7
487 // ----------------------------------------------------------------------------
488
489 // Implementation (C) 2004 Fredrik Roubert
490
//
// BASE64 decoding table: maps a byte to the 6 bit value it stands for or to
// 0xff if it is not a BASE64 character
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x3f: '+', '/' and the digits
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x5f: upper case letters
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x7f: lower case letters
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
529
530 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
531 {
532 size_t len = 0;
533
534 while ( *psz && (!buf || (len < n)) )
535 {
536 unsigned char cc = *psz++;
537 if (cc != '+')
538 {
539 // plain ASCII char
540 if (buf)
541 *buf++ = cc;
542 len++;
543 }
544 else if (*psz == '-')
545 {
546 // encoded plus sign
547 if (buf)
548 *buf++ = cc;
549 len++;
550 psz++;
551 }
552 else // start of BASE64 encoded string
553 {
554 bool lsb, ok;
555 unsigned int d, l;
556 for ( ok = lsb = false, d = 0, l = 0;
557 (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
558 psz++ )
559 {
560 d <<= 6;
561 d += cc;
562 for (l += 6; l >= 8; lsb = !lsb)
563 {
564 unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
565 if (lsb)
566 {
567 if (buf)
568 *buf++ |= c;
569 len ++;
570 }
571 else
572 {
573 if (buf)
574 *buf = (wchar_t)(c << 8);
575 }
576
577 ok = true;
578 }
579 }
580
581 if ( !ok )
582 {
583 // in valid UTF7 we should have valid characters after '+'
584 return wxCONV_FAILED;
585 }
586
587 if (*psz == '-')
588 psz++;
589 }
590 }
591
592 if ( buf && (len < n) )
593 *buf = '\0';
594
595 return len;
596 }
597
//
// BASE64 encoding table: maps a 6 bit value to the character representing it
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
612
//
// UTF-7 encoding table: gives the class of each ASCII character
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    /* 0x00 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20 */ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
632
633 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
634 {
635 size_t len = 0;
636
637 while (*psz && ((!buf) || (len < n)))
638 {
639 wchar_t cc = *psz++;
640 if (cc < 0x80 && utf7encode[cc] < 1)
641 {
642 // plain ASCII char
643 if (buf)
644 *buf++ = (char)cc;
645
646 len++;
647 }
648 #ifndef WC_UTF16
649 else if (((wxUint32)cc) > 0xffff)
650 {
651 // no surrogate pair generation (yet?)
652 return wxCONV_FAILED;
653 }
654 #endif
655 else
656 {
657 if (buf)
658 *buf++ = '+';
659
660 len++;
661 if (cc != '+')
662 {
663 // BASE64 encode string
664 unsigned int lsb, d, l;
665 for (d = 0, l = 0; /*nothing*/; psz++)
666 {
667 for (lsb = 0; lsb < 2; lsb ++)
668 {
669 d <<= 8;
670 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
671
672 for (l += 8; l >= 6; )
673 {
674 l -= 6;
675 if (buf)
676 *buf++ = utf7enb64[(d >> l) % 64];
677 len++;
678 }
679 }
680
681 cc = *psz;
682 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
683 break;
684 }
685
686 if (l != 0)
687 {
688 if (buf)
689 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
690
691 len++;
692 }
693 }
694
695 if (buf)
696 *buf++ = '-';
697 len++;
698 }
699 }
700
701 if (buf && (len < n))
702 *buf = 0;
703
704 return len;
705 }
706
707 // ----------------------------------------------------------------------------
708 // UTF-8
709 // ----------------------------------------------------------------------------
710
711 static wxUint32 utf8_max[]=
712 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
713
714 // boundaries of the private use area we use to (temporarily) remap invalid
715 // characters invalid in a UTF-8 encoded string
716 const wxUint32 wxUnicodePUA = 0x100000;
717 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
718
719 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
720 {
721 size_t len = 0;
722
723 while (*psz && ((!buf) || (len < n)))
724 {
725 const char *opsz = psz;
726 bool invalid = false;
727 unsigned char cc = *psz++, fc = cc;
728 unsigned cnt;
729 for (cnt = 0; fc & 0x80; cnt++)
730 fc <<= 1;
731
732 if (!cnt)
733 {
734 // plain ASCII char
735 if (buf)
736 *buf++ = cc;
737 len++;
738
739 // escape the escape character for octal escapes
740 if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
741 && cc == '\\' && (!buf || len < n))
742 {
743 if (buf)
744 *buf++ = cc;
745 len++;
746 }
747 }
748 else
749 {
750 cnt--;
751 if (!cnt)
752 {
753 // invalid UTF-8 sequence
754 invalid = true;
755 }
756 else
757 {
758 unsigned ocnt = cnt - 1;
759 wxUint32 res = cc & (0x3f >> cnt);
760 while (cnt--)
761 {
762 cc = *psz;
763 if ((cc & 0xC0) != 0x80)
764 {
765 // invalid UTF-8 sequence
766 invalid = true;
767 break;
768 }
769
770 psz++;
771 res = (res << 6) | (cc & 0x3f);
772 }
773
774 if (invalid || res <= utf8_max[ocnt])
775 {
776 // illegal UTF-8 encoding
777 invalid = true;
778 }
779 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
780 res >= wxUnicodePUA && res < wxUnicodePUAEnd)
781 {
782 // if one of our PUA characters turns up externally
783 // it must also be treated as an illegal sequence
784 // (a bit like you have to escape an escape character)
785 invalid = true;
786 }
787 else
788 {
789 #ifdef WC_UTF16
790 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
791 size_t pa = encode_utf16(res, (wxUint16 *)buf);
792 if (pa == wxCONV_FAILED)
793 {
794 invalid = true;
795 }
796 else
797 {
798 if (buf)
799 buf += pa;
800 len += pa;
801 }
802 #else // !WC_UTF16
803 if (buf)
804 *buf++ = (wchar_t)res;
805 len++;
806 #endif // WC_UTF16/!WC_UTF16
807 }
808 }
809
810 if (invalid)
811 {
812 if (m_options & MAP_INVALID_UTF8_TO_PUA)
813 {
814 while (opsz < psz && (!buf || len < n))
815 {
816 #ifdef WC_UTF16
817 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
818 size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
819 wxASSERT(pa != wxCONV_FAILED);
820 if (buf)
821 buf += pa;
822 opsz++;
823 len += pa;
824 #else
825 if (buf)
826 *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
827 opsz++;
828 len++;
829 #endif
830 }
831 }
832 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
833 {
834 while (opsz < psz && (!buf || len < n))
835 {
836 if ( buf && len + 3 < n )
837 {
838 unsigned char on = *opsz;
839 *buf++ = L'\\';
840 *buf++ = (wchar_t)( L'0' + on / 0100 );
841 *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
842 *buf++ = (wchar_t)( L'0' + on % 010 );
843 }
844
845 opsz++;
846 len += 4;
847 }
848 }
849 else // MAP_INVALID_UTF8_NOT
850 {
851 return wxCONV_FAILED;
852 }
853 }
854 }
855 }
856
857 if (buf && (len < n))
858 *buf = 0;
859
860 return len;
861 }
862
// returns true if wch is one of the octal digits, i.e. L'0' to L'7'
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch < L'8';
}
867
868 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
869 {
870 size_t len = 0;
871
872 while (*psz && ((!buf) || (len < n)))
873 {
874 wxUint32 cc;
875
876 #ifdef WC_UTF16
877 // cast is ok for WC_UTF16
878 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
879 psz += (pa == wxCONV_FAILED) ? 1 : pa;
880 #else
881 cc = (*psz++) & 0x7fffffff;
882 #endif
883
884 if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
885 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
886 {
887 if (buf)
888 *buf++ = (char)(cc - wxUnicodePUA);
889 len++;
890 }
891 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
892 && cc == L'\\' && psz[0] == L'\\' )
893 {
894 if (buf)
895 *buf++ = (char)cc;
896 psz++;
897 len++;
898 }
899 else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
900 cc == L'\\' &&
901 isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
902 {
903 if (buf)
904 {
905 *buf++ = (char) ((psz[0] - L'0') * 0100 +
906 (psz[1] - L'0') * 010 +
907 (psz[2] - L'0'));
908 }
909
910 psz += 3;
911 len++;
912 }
913 else
914 {
915 unsigned cnt;
916 for (cnt = 0; cc > utf8_max[cnt]; cnt++)
917 {
918 }
919
920 if (!cnt)
921 {
922 // plain ASCII char
923 if (buf)
924 *buf++ = (char) cc;
925 len++;
926 }
927 else
928 {
929 len += cnt + 1;
930 if (buf)
931 {
932 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
933 while (cnt--)
934 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
935 }
936 }
937 }
938 }
939
940 if (buf && (len < n))
941 *buf = 0;
942
943 return len;
944 }
945
946 // ============================================================================
947 // UTF-16
948 // ============================================================================
949
950 #ifdef WORDS_BIGENDIAN
951 #define wxMBConvUTF16straight wxMBConvUTF16BE
952 #define wxMBConvUTF16swap wxMBConvUTF16LE
953 #else
954 #define wxMBConvUTF16swap wxMBConvUTF16BE
955 #define wxMBConvUTF16straight wxMBConvUTF16LE
956 #endif
957
958 /* static */
959 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
960 {
961 if ( srcLen == wxNO_LEN )
962 {
963 // count the number of bytes in input, including the trailing NULs
964 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
965 for ( srcLen = 1; *inBuff++; srcLen++ )
966 ;
967
968 srcLen *= BYTES_PER_CHAR;
969 }
970 else // we already have the length
971 {
972 // we can only convert an entire number of UTF-16 characters
973 if ( srcLen % BYTES_PER_CHAR )
974 return wxCONV_FAILED;
975 }
976
977 return srcLen;
978 }
979
980 // case when in-memory representation is UTF-16 too
981 #ifdef WC_UTF16
982
983 // ----------------------------------------------------------------------------
984 // conversions without endianness change
985 // ----------------------------------------------------------------------------
986
987 size_t
988 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
989 const char *src, size_t srcLen) const
990 {
991 // set up the scene for using memcpy() (which is presumably more efficient
992 // than copying the bytes one by one)
993 srcLen = GetLength(src, srcLen);
994 if ( srcLen == wxNO_LEN )
995 return wxCONV_FAILED;
996
997 const size_t inLen = srcLen / BYTES_PER_CHAR;
998 if ( dst )
999 {
1000 if ( dstLen < inLen )
1001 return wxCONV_FAILED;
1002
1003 memcpy(dst, src, srcLen);
1004 }
1005
1006 return inLen;
1007 }
1008
1009 size_t
1010 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1011 const wchar_t *src, size_t srcLen) const
1012 {
1013 if ( srcLen == wxNO_LEN )
1014 srcLen = wxWcslen(src) + 1;
1015
1016 srcLen *= BYTES_PER_CHAR;
1017
1018 if ( dst )
1019 {
1020 if ( dstLen < srcLen )
1021 return wxCONV_FAILED;
1022
1023 memcpy(dst, src, srcLen);
1024 }
1025
1026 return srcLen;
1027 }
1028
1029 // ----------------------------------------------------------------------------
1030 // endian-reversing conversions
1031 // ----------------------------------------------------------------------------
1032
1033 size_t
1034 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1035 const char *src, size_t srcLen) const
1036 {
1037 srcLen = GetLength(src, srcLen);
1038 if ( srcLen == wxNO_LEN )
1039 return wxCONV_FAILED;
1040
1041 srcLen /= BYTES_PER_CHAR;
1042
1043 if ( dst )
1044 {
1045 if ( dstLen < srcLen )
1046 return wxCONV_FAILED;
1047
1048 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1049 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1050 {
1051 *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1052 }
1053 }
1054
1055 return srcLen;
1056 }
1057
1058 size_t
1059 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1060 const wchar_t *src, size_t srcLen) const
1061 {
1062 if ( srcLen == wxNO_LEN )
1063 srcLen = wxWcslen(src) + 1;
1064
1065 srcLen *= BYTES_PER_CHAR;
1066
1067 if ( dst )
1068 {
1069 if ( dstLen < srcLen )
1070 return wxCONV_FAILED;
1071
1072 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1073 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1074 {
1075 *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1076 }
1077 }
1078
1079 return srcLen;
1080 }
1081
1082 #else // !WC_UTF16: wchar_t is UTF-32
1083
1084 // ----------------------------------------------------------------------------
1085 // conversions without endianness change
1086 // ----------------------------------------------------------------------------
1087
1088 size_t
1089 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1090 const char *src, size_t srcLen) const
1091 {
1092 srcLen = GetLength(src, srcLen);
1093 if ( srcLen == wxNO_LEN )
1094 return wxCONV_FAILED;
1095
1096 const size_t inLen = srcLen / BYTES_PER_CHAR;
1097 if ( !dst )
1098 {
1099 // optimization: return maximal space which could be needed for this
1100 // string even if the real size could be smaller if the buffer contains
1101 // any surrogates
1102 return inLen;
1103 }
1104
1105 size_t outLen = 0;
1106 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1107 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1108 {
1109 const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1110 if ( !inBuff )
1111 return wxCONV_FAILED;
1112
1113 if ( ++outLen > dstLen )
1114 return wxCONV_FAILED;
1115
1116 *dst++ = ch;
1117 }
1118
1119
1120 return outLen;
1121 }
1122
1123 size_t
1124 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1125 const wchar_t *src, size_t srcLen) const
1126 {
1127 if ( srcLen == wxNO_LEN )
1128 srcLen = wxWcslen(src) + 1;
1129
1130 size_t outLen = 0;
1131 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1132 for ( size_t n = 0; n < srcLen; n++ )
1133 {
1134 wxUint16 cc[2];
1135 const size_t numChars = encode_utf16(*src++, cc);
1136 if ( numChars == wxCONV_FAILED )
1137 return wxCONV_FAILED;
1138
1139 outLen += numChars * BYTES_PER_CHAR;
1140 if ( outBuff )
1141 {
1142 if ( outLen > dstLen )
1143 return wxCONV_FAILED;
1144
1145 *outBuff++ = cc[0];
1146 if ( numChars == 2 )
1147 {
1148 // second character of a surrogate
1149 *outBuff++ = cc[1];
1150 }
1151 }
1152 }
1153
1154 return outLen;
1155 }
1156
1157 // ----------------------------------------------------------------------------
1158 // endian-reversing conversions
1159 // ----------------------------------------------------------------------------
1160
1161 size_t
1162 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1163 const char *src, size_t srcLen) const
1164 {
1165 srcLen = GetLength(src, srcLen);
1166 if ( srcLen == wxNO_LEN )
1167 return wxCONV_FAILED;
1168
1169 const size_t inLen = srcLen / BYTES_PER_CHAR;
1170 if ( !dst )
1171 {
1172 // optimization: return maximal space which could be needed for this
1173 // string even if the real size could be smaller if the buffer contains
1174 // any surrogates
1175 return inLen;
1176 }
1177
1178 size_t outLen = 0;
1179 const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1180 for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1181 {
1182 wxUint32 ch;
1183 wxUint16 tmp[2];
1184
1185 tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1186 inBuff++;
1187 tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1188
1189 const size_t numChars = decode_utf16(tmp, ch);
1190 if ( numChars == wxCONV_FAILED )
1191 return wxCONV_FAILED;
1192
1193 if ( numChars == 2 )
1194 inBuff++;
1195
1196 if ( ++outLen > dstLen )
1197 return wxCONV_FAILED;
1198
1199 *dst++ = ch;
1200 }
1201
1202
1203 return outLen;
1204 }
1205
1206 size_t
1207 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1208 const wchar_t *src, size_t srcLen) const
1209 {
1210 if ( srcLen == wxNO_LEN )
1211 srcLen = wxWcslen(src) + 1;
1212
1213 size_t outLen = 0;
1214 wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1215 for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1216 {
1217 wxUint16 cc[2];
1218 const size_t numChars = encode_utf16(*src, cc);
1219 if ( numChars == wxCONV_FAILED )
1220 return wxCONV_FAILED;
1221
1222 outLen += numChars * BYTES_PER_CHAR;
1223 if ( outBuff )
1224 {
1225 if ( outLen > dstLen )
1226 return wxCONV_FAILED;
1227
1228 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1229 if ( numChars == 2 )
1230 {
1231 // second character of a surrogate
1232 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1233 }
1234 }
1235 }
1236
1237 return outLen;
1238 }
1239
1240 #endif // WC_UTF16/!WC_UTF16
1241
1242
1243 // ============================================================================
1244 // UTF-32
1245 // ============================================================================
1246
1247 #ifdef WORDS_BIGENDIAN
1248 #define wxMBConvUTF32straight wxMBConvUTF32BE
1249 #define wxMBConvUTF32swap wxMBConvUTF32LE
1250 #else
1251 #define wxMBConvUTF32swap wxMBConvUTF32BE
1252 #define wxMBConvUTF32straight wxMBConvUTF32LE
1253 #endif
1254
1255
1256 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1257 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1258
1259 /* static */
1260 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1261 {
1262 if ( srcLen == wxNO_LEN )
1263 {
1264 // count the number of bytes in input, including the trailing NULs
1265 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1266 for ( srcLen = 1; *inBuff++; srcLen++ )
1267 ;
1268
1269 srcLen *= BYTES_PER_CHAR;
1270 }
1271 else // we already have the length
1272 {
1273 // we can only convert an entire number of UTF-32 characters
1274 if ( srcLen % BYTES_PER_CHAR )
1275 return wxCONV_FAILED;
1276 }
1277
1278 return srcLen;
1279 }
1280
1281 // case when in-memory representation is UTF-16
1282 #ifdef WC_UTF16
1283
1284 // ----------------------------------------------------------------------------
1285 // conversions without endianness change
1286 // ----------------------------------------------------------------------------
1287
1288 size_t
1289 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1290 const char *src, size_t srcLen) const
1291 {
1292 srcLen = GetLength(src, srcLen);
1293 if ( srcLen == wxNO_LEN )
1294 return wxCONV_FAILED;
1295
1296 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1297 const size_t inLen = srcLen / BYTES_PER_CHAR;
1298 size_t outLen = 0;
1299 for ( size_t n = 0; n < inLen; n++ )
1300 {
1301 wxUint16 cc[2];
1302 const size_t numChars = encode_utf16(*inBuff++, cc);
1303 if ( numChars == wxCONV_FAILED )
1304 return wxCONV_FAILED;
1305
1306 outLen += numChars;
1307 if ( dst )
1308 {
1309 if ( outLen > dstLen )
1310 return wxCONV_FAILED;
1311
1312 *dst++ = cc[0];
1313 if ( numChars == 2 )
1314 {
1315 // second character of a surrogate
1316 *dst++ = cc[1];
1317 }
1318 }
1319 }
1320
1321 return outLen;
1322 }
1323
1324 size_t
1325 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1326 const wchar_t *src, size_t srcLen) const
1327 {
1328 if ( srcLen == wxNO_LEN )
1329 srcLen = wxWcslen(src) + 1;
1330
1331 if ( !dst )
1332 {
1333 // optimization: return maximal space which could be needed for this
1334 // string instead of the exact amount which could be less if there are
1335 // any surrogates in the input
1336 //
1337 // we consider that surrogates are rare enough to make it worthwhile to
1338 // avoid running the loop below at the cost of slightly extra memory
1339 // consumption
1340 return srcLen * BYTES_PER_CHAR;
1341 }
1342
1343 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1344 size_t outLen = 0;
1345 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1346 {
1347 const wxUint32 ch = wxDecodeSurrogate(&src);
1348 if ( !src )
1349 return wxCONV_FAILED;
1350
1351 outLen += BYTES_PER_CHAR;
1352
1353 if ( outLen > dstLen )
1354 return wxCONV_FAILED;
1355
1356 *outBuff++ = ch;
1357 }
1358
1359 return outLen;
1360 }
1361
1362 // ----------------------------------------------------------------------------
1363 // endian-reversing conversions
1364 // ----------------------------------------------------------------------------
1365
1366 size_t
1367 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1368 const char *src, size_t srcLen) const
1369 {
1370 srcLen = GetLength(src, srcLen);
1371 if ( srcLen == wxNO_LEN )
1372 return wxCONV_FAILED;
1373
1374 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1375 const size_t inLen = srcLen / BYTES_PER_CHAR;
1376 size_t outLen = 0;
1377 for ( size_t n = 0; n < inLen; n++, inBuff++ )
1378 {
1379 wxUint16 cc[2];
1380 const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1381 if ( numChars == wxCONV_FAILED )
1382 return wxCONV_FAILED;
1383
1384 outLen += numChars;
1385 if ( dst )
1386 {
1387 if ( outLen > dstLen )
1388 return wxCONV_FAILED;
1389
1390 *dst++ = cc[0];
1391 if ( numChars == 2 )
1392 {
1393 // second character of a surrogate
1394 *dst++ = cc[1];
1395 }
1396 }
1397 }
1398
1399 return outLen;
1400 }
1401
1402 size_t
1403 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1404 const wchar_t *src, size_t srcLen) const
1405 {
1406 if ( srcLen == wxNO_LEN )
1407 srcLen = wxWcslen(src) + 1;
1408
1409 if ( !dst )
1410 {
1411 // optimization: return maximal space which could be needed for this
1412 // string instead of the exact amount which could be less if there are
1413 // any surrogates in the input
1414 //
1415 // we consider that surrogates are rare enough to make it worthwhile to
1416 // avoid running the loop below at the cost of slightly extra memory
1417 // consumption
1418 return srcLen*BYTES_PER_CHAR;
1419 }
1420
1421 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1422 size_t outLen = 0;
1423 for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1424 {
1425 const wxUint32 ch = wxDecodeSurrogate(&src);
1426 if ( !src )
1427 return wxCONV_FAILED;
1428
1429 outLen += BYTES_PER_CHAR;
1430
1431 if ( outLen > dstLen )
1432 return wxCONV_FAILED;
1433
1434 *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1435 }
1436
1437 return outLen;
1438 }
1439
1440 #else // !WC_UTF16: wchar_t is UTF-32
1441
1442 // ----------------------------------------------------------------------------
1443 // conversions without endianness change
1444 // ----------------------------------------------------------------------------
1445
1446 size_t
1447 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1448 const char *src, size_t srcLen) const
1449 {
1450 // use memcpy() as it should be much faster than hand-written loop
1451 srcLen = GetLength(src, srcLen);
1452 if ( srcLen == wxNO_LEN )
1453 return wxCONV_FAILED;
1454
1455 const size_t inLen = srcLen/BYTES_PER_CHAR;
1456 if ( dst )
1457 {
1458 if ( dstLen < inLen )
1459 return wxCONV_FAILED;
1460
1461 memcpy(dst, src, srcLen);
1462 }
1463
1464 return inLen;
1465 }
1466
1467 size_t
1468 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1469 const wchar_t *src, size_t srcLen) const
1470 {
1471 if ( srcLen == wxNO_LEN )
1472 srcLen = wxWcslen(src) + 1;
1473
1474 srcLen *= BYTES_PER_CHAR;
1475
1476 if ( dst )
1477 {
1478 if ( dstLen < srcLen )
1479 return wxCONV_FAILED;
1480
1481 memcpy(dst, src, srcLen);
1482 }
1483
1484 return srcLen;
1485 }
1486
1487 // ----------------------------------------------------------------------------
1488 // endian-reversing conversions
1489 // ----------------------------------------------------------------------------
1490
1491 size_t
1492 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1493 const char *src, size_t srcLen) const
1494 {
1495 srcLen = GetLength(src, srcLen);
1496 if ( srcLen == wxNO_LEN )
1497 return wxCONV_FAILED;
1498
1499 srcLen /= BYTES_PER_CHAR;
1500
1501 if ( dst )
1502 {
1503 if ( dstLen < srcLen )
1504 return wxCONV_FAILED;
1505
1506 const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1507 for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1508 {
1509 *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1510 }
1511 }
1512
1513 return srcLen;
1514 }
1515
1516 size_t
1517 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1518 const wchar_t *src, size_t srcLen) const
1519 {
1520 if ( srcLen == wxNO_LEN )
1521 srcLen = wxWcslen(src) + 1;
1522
1523 srcLen *= BYTES_PER_CHAR;
1524
1525 if ( dst )
1526 {
1527 if ( dstLen < srcLen )
1528 return wxCONV_FAILED;
1529
1530 wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1531 for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1532 {
1533 *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1534 }
1535 }
1536
1537 return srcLen;
1538 }
1539
1540 #endif // WC_UTF16/!WC_UTF16
1541
1542
1543 // ============================================================================
1544 // The classes doing conversion using the iconv_xxx() functions
1545 // ============================================================================
1546
1547 #ifdef HAVE_ICONV
1548
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) tests the result of an iconv() call: cres is
// the value returned by iconv() and bufLeft the number of input bytes it left
// unconverted (only used by the glibc <= 2.1 variant).
//
// NB: the macro parameters are parenthesized so that passing an expression
//     containing operators of lower precedence than "==" works as expected
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
    #define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                         (errno != E2BIG || (bufLeft) != 0))
#else
    #define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type expected by iconv() on
// this platform (ICONV_CONST expands to either "const" or nothing)
#define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))

// the value returned by iconv_open() on failure
#define ICONV_T_INVALID ((iconv_t)-1)
1567
1568 #if SIZEOF_WCHAR_T == 4
1569 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1570 #define WC_ENC wxFONTENCODING_UTF32
1571 #elif SIZEOF_WCHAR_T == 2
1572 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1573 #define WC_ENC wxFONTENCODING_UTF16
1574 #else // sizeof(wchar_t) != 2 nor 4
1575 // does this ever happen?
1576 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1577 #endif
1578
1579 // ----------------------------------------------------------------------------
1580 // wxMBConv_iconv: encapsulates an iconv character set
1581 // ----------------------------------------------------------------------------
1582
1583 class wxMBConv_iconv : public wxMBConv
1584 {
1585 public:
1586 wxMBConv_iconv(const wxChar *name);
1587 virtual ~wxMBConv_iconv();
1588
1589 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1590 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1591
1592 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1593 virtual size_t GetMBNulLen() const;
1594
1595 virtual wxMBConv *Clone() const
1596 {
1597 wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1598 p->m_minMBCharWidth = m_minMBCharWidth;
1599 return p;
1600 }
1601
1602 bool IsOk() const
1603 { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1604
1605 protected:
1606 // the iconv handlers used to translate from multibyte
1607 // to wide char and in the other direction
1608 iconv_t m2w,
1609 w2m;
1610
1611 #if wxUSE_THREADS
1612 // guards access to m2w and w2m objects
1613 wxMutex m_iconvMutex;
1614 #endif
1615
1616 private:
1617 // the name (for iconv_open()) of a wide char charset -- if none is
1618 // available on this machine, it will remain NULL
1619 static wxString ms_wcCharsetName;
1620
1621 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1622 // different endian-ness than the native one
1623 static bool ms_wcNeedsSwap;
1624
1625
1626 // name of the encoding handled by this conversion
1627 wxString m_name;
1628
1629 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1630 // initially
1631 size_t m_minMBCharWidth;
1632 };
1633
1634 // make the constructor available for unit testing
1635 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1636 {
1637 wxMBConv_iconv* result = new wxMBConv_iconv( name );
1638 if ( !result->IsOk() )
1639 {
1640 delete result;
1641 return 0;
1642 }
1643
1644 return result;
1645 }
1646
1647 wxString wxMBConv_iconv::ms_wcCharsetName;
1648 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1649
1650 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1651 : m_name(name)
1652 {
1653 m_minMBCharWidth = 0;
1654
1655 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1656 // names for the charsets
1657 const wxCharBuffer cname(wxString(name).ToAscii());
1658
1659 // check for charset that represents wchar_t:
1660 if ( ms_wcCharsetName.empty() )
1661 {
1662 wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1663
1664 #if wxUSE_FONTMAP
1665 const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1666 #else // !wxUSE_FONTMAP
1667 static const wxChar *names[] =
1668 {
1669 #if SIZEOF_WCHAR_T == 4
1670 _T("UCS-4"),
1671 #elif SIZEOF_WCHAR_T = 2
1672 _T("UCS-2"),
1673 #endif
1674 NULL
1675 };
1676 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1677
1678 for ( ; *names && ms_wcCharsetName.empty(); ++names )
1679 {
1680 const wxString nameCS(*names);
1681
1682 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1683 wxString nameXE(nameCS);
1684
1685 #ifdef WORDS_BIGENDIAN
1686 nameXE += _T("BE");
1687 #else // little endian
1688 nameXE += _T("LE");
1689 #endif
1690
1691 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1692 nameXE.c_str());
1693
1694 m2w = iconv_open(nameXE.ToAscii(), cname);
1695 if ( m2w == ICONV_T_INVALID )
1696 {
1697 // try charset w/o bytesex info (e.g. "UCS4")
1698 wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1699 nameCS.c_str());
1700 m2w = iconv_open(nameCS.ToAscii(), cname);
1701
1702 // and check for bytesex ourselves:
1703 if ( m2w != ICONV_T_INVALID )
1704 {
1705 char buf[2], *bufPtr;
1706 wchar_t wbuf[2], *wbufPtr;
1707 size_t insz, outsz;
1708 size_t res;
1709
1710 buf[0] = 'A';
1711 buf[1] = 0;
1712 wbuf[0] = 0;
1713 insz = 2;
1714 outsz = SIZEOF_WCHAR_T * 2;
1715 wbufPtr = wbuf;
1716 bufPtr = buf;
1717
1718 res = iconv(
1719 m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1720 (char**)&wbufPtr, &outsz);
1721
1722 if (ICONV_FAILED(res, insz))
1723 {
1724 wxLogLastError(wxT("iconv"));
1725 wxLogError(_("Conversion to charset '%s' doesn't work."),
1726 nameCS.c_str());
1727 }
1728 else // ok, can convert to this encoding, remember it
1729 {
1730 ms_wcCharsetName = nameCS;
1731 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1732 }
1733 }
1734 }
1735 else // use charset not requiring byte swapping
1736 {
1737 ms_wcCharsetName = nameXE;
1738 }
1739 }
1740
1741 wxLogTrace(TRACE_STRCONV,
1742 wxT("iconv wchar_t charset is \"%s\"%s"),
1743 ms_wcCharsetName.empty() ? _T("<none>")
1744 : ms_wcCharsetName.c_str(),
1745 ms_wcNeedsSwap ? _T(" (needs swap)")
1746 : _T(""));
1747 }
1748 else // we already have ms_wcCharsetName
1749 {
1750 m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1751 }
1752
1753 if ( ms_wcCharsetName.empty() )
1754 {
1755 w2m = ICONV_T_INVALID;
1756 }
1757 else
1758 {
1759 w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1760 if ( w2m == ICONV_T_INVALID )
1761 {
1762 wxLogTrace(TRACE_STRCONV,
1763 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1764 ms_wcCharsetName.c_str(), cname.data());
1765 }
1766 }
1767 }
1768
1769 wxMBConv_iconv::~wxMBConv_iconv()
1770 {
1771 if ( m2w != ICONV_T_INVALID )
1772 iconv_close(m2w);
1773 if ( w2m != ICONV_T_INVALID )
1774 iconv_close(w2m);
1775 }
1776
1777 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1778 {
1779 // find the string length: notice that must be done differently for
1780 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1781 size_t inbuf;
1782 const size_t nulLen = GetMBNulLen();
1783 switch ( nulLen )
1784 {
1785 default:
1786 return wxCONV_FAILED;
1787
1788 case 1:
1789 inbuf = strlen(psz); // arguably more optimized than our version
1790 break;
1791
1792 case 2:
1793 case 4:
1794 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1795 // they also have to start at character boundary and not span two
1796 // adjacent characters
1797 const char *p;
1798 for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1799 ;
1800 inbuf = p - psz;
1801 break;
1802 }
1803
1804 #if wxUSE_THREADS
1805 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1806 // Unfortunately there is a couple of global wxCSConv objects such as
1807 // wxConvLocal that are used all over wx code, so we have to make sure
1808 // the handle is used by at most one thread at the time. Otherwise
1809 // only a few wx classes would be safe to use from non-main threads
1810 // as MB<->WC conversion would fail "randomly".
1811 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1812 #endif // wxUSE_THREADS
1813
1814 size_t outbuf = n * SIZEOF_WCHAR_T;
1815 size_t res, cres;
1816 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1817 wchar_t *bufPtr = buf;
1818 const char *pszPtr = psz;
1819
1820 if (buf)
1821 {
1822 // have destination buffer, convert there
1823 cres = iconv(m2w,
1824 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1825 (char**)&bufPtr, &outbuf);
1826 res = n - (outbuf / SIZEOF_WCHAR_T);
1827
1828 if (ms_wcNeedsSwap)
1829 {
1830 // convert to native endianness
1831 for ( unsigned i = 0; i < res; i++ )
1832 buf[n] = WC_BSWAP(buf[i]);
1833 }
1834
1835 // NUL-terminate the string if there is any space left
1836 if (res < n)
1837 buf[res] = 0;
1838 }
1839 else
1840 {
1841 // no destination buffer... convert using temp buffer
1842 // to calculate destination buffer requirement
1843 wchar_t tbuf[8];
1844 res = 0;
1845
1846 do
1847 {
1848 bufPtr = tbuf;
1849 outbuf = 8 * SIZEOF_WCHAR_T;
1850
1851 cres = iconv(m2w,
1852 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1853 (char**)&bufPtr, &outbuf );
1854
1855 res += 8 - (outbuf / SIZEOF_WCHAR_T);
1856 }
1857 while ((cres == (size_t)-1) && (errno == E2BIG));
1858 }
1859
1860 if (ICONV_FAILED(cres, inbuf))
1861 {
1862 //VS: it is ok if iconv fails, hence trace only
1863 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1864 return wxCONV_FAILED;
1865 }
1866
1867 return res;
1868 }
1869
1870 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1871 {
1872 #if wxUSE_THREADS
1873 // NB: explained in MB2WC
1874 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1875 #endif
1876
1877 size_t inlen = wxWcslen(psz);
1878 size_t inbuf = inlen * SIZEOF_WCHAR_T;
1879 size_t outbuf = n;
1880 size_t res, cres;
1881
1882 wchar_t *tmpbuf = 0;
1883
1884 if (ms_wcNeedsSwap)
1885 {
1886 // need to copy to temp buffer to switch endianness
1887 // (doing WC_BSWAP twice on the original buffer won't help, as it
1888 // could be in read-only memory, or be accessed in some other thread)
1889 tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1890 for ( size_t i = 0; i < inlen; i++ )
1891 tmpbuf[n] = WC_BSWAP(psz[i]);
1892
1893 tmpbuf[inlen] = L'\0';
1894 psz = tmpbuf;
1895 }
1896
1897 if (buf)
1898 {
1899 // have destination buffer, convert there
1900 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1901
1902 res = n - outbuf;
1903
1904 // NB: iconv was given only wcslen(psz) characters on input, and so
1905 // it couldn't convert the trailing zero. Let's do it ourselves
1906 // if there's some room left for it in the output buffer.
1907 if (res < n)
1908 buf[0] = 0;
1909 }
1910 else
1911 {
1912 // no destination buffer: convert using temp buffer
1913 // to calculate destination buffer requirement
1914 char tbuf[16];
1915 res = 0;
1916 do
1917 {
1918 buf = tbuf;
1919 outbuf = 16;
1920
1921 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1922
1923 res += 16 - outbuf;
1924 }
1925 while ((cres == (size_t)-1) && (errno == E2BIG));
1926 }
1927
1928 if (ms_wcNeedsSwap)
1929 {
1930 free(tmpbuf);
1931 }
1932
1933 if (ICONV_FAILED(cres, inbuf))
1934 {
1935 wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1936 return wxCONV_FAILED;
1937 }
1938
1939 return res;
1940 }
1941
1942 size_t wxMBConv_iconv::GetMBNulLen() const
1943 {
1944 if ( m_minMBCharWidth == 0 )
1945 {
1946 wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1947
1948 #if wxUSE_THREADS
1949 // NB: explained in MB2WC
1950 wxMutexLocker lock(self->m_iconvMutex);
1951 #endif
1952
1953 wchar_t *wnul = L"";
1954 char buf[8]; // should be enough for NUL in any encoding
1955 size_t inLen = sizeof(wchar_t),
1956 outLen = WXSIZEOF(buf);
1957 char *inBuff = (char *)wnul;
1958 char *outBuff = buf;
1959 if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1960 {
1961 self->m_minMBCharWidth = (size_t)-1;
1962 }
1963 else // ok
1964 {
1965 self->m_minMBCharWidth = outBuff - buf;
1966 }
1967 }
1968
1969 return m_minMBCharWidth;
1970 }
1971
1972 #endif // HAVE_ICONV
1973
1974
1975 // ============================================================================
1976 // Win32 conversion classes
1977 // ============================================================================
1978
1979 #ifdef wxHAVE_WIN32_MB2WC
1980
1981 // from utils.cpp
1982 #if wxUSE_FONTMAP
1983 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1984 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1985 #endif
1986
1987 class wxMBConv_win32 : public wxMBConv
1988 {
1989 public:
1990 wxMBConv_win32()
1991 {
1992 m_CodePage = CP_ACP;
1993 m_minMBCharWidth = 0;
1994 }
1995
1996 wxMBConv_win32(const wxMBConv_win32& conv)
1997 {
1998 m_CodePage = conv.m_CodePage;
1999 m_minMBCharWidth = conv.m_minMBCharWidth;
2000 }
2001
2002 #if wxUSE_FONTMAP
2003 wxMBConv_win32(const wxChar* name)
2004 {
2005 m_CodePage = wxCharsetToCodepage(name);
2006 m_minMBCharWidth = 0;
2007 }
2008
2009 wxMBConv_win32(wxFontEncoding encoding)
2010 {
2011 m_CodePage = wxEncodingToCodepage(encoding);
2012 m_minMBCharWidth = 0;
2013 }
2014 #endif // wxUSE_FONTMAP
2015
2016 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2017 {
2018 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2019 // the behaviour is not compatible with the Unix version (using iconv)
2020 // and break the library itself, e.g. wxTextInputStream::NextChar()
2021 // wouldn't work if reading an incomplete MB char didn't result in an
2022 // error
2023 //
2024 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2025 // Win XP or newer and it is not supported for UTF-[78] so we always
2026 // use our own conversions in this case. See
2027 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2028 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2029 if ( m_CodePage == CP_UTF8 )
2030 {
2031 return wxConvUTF8.MB2WC(buf, psz, n);
2032 }
2033
2034 if ( m_CodePage == CP_UTF7 )
2035 {
2036 return wxConvUTF7.MB2WC(buf, psz, n);
2037 }
2038
2039 int flags = 0;
2040 if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2041 IsAtLeastWin2kSP4() )
2042 {
2043 flags = MB_ERR_INVALID_CHARS;
2044 }
2045
2046 const size_t len = ::MultiByteToWideChar
2047 (
2048 m_CodePage, // code page
2049 flags, // flags: fall on error
2050 psz, // input string
2051 -1, // its length (NUL-terminated)
2052 buf, // output string
2053 buf ? n : 0 // size of output buffer
2054 );
2055 if ( !len )
2056 {
2057 // function totally failed
2058 return wxCONV_FAILED;
2059 }
2060
2061 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2062 // check if we succeeded, by doing a double trip:
2063 if ( !flags && buf )
2064 {
2065 const size_t mbLen = strlen(psz);
2066 wxCharBuffer mbBuf(mbLen);
2067 if ( ::WideCharToMultiByte
2068 (
2069 m_CodePage,
2070 0,
2071 buf,
2072 -1,
2073 mbBuf.data(),
2074 mbLen + 1, // size in bytes, not length
2075 NULL,
2076 NULL
2077 ) == 0 ||
2078 strcmp(mbBuf, psz) != 0 )
2079 {
2080 // we didn't obtain the same thing we started from, hence
2081 // the conversion was lossy and we consider that it failed
2082 return wxCONV_FAILED;
2083 }
2084 }
2085
2086 // note that it returns count of written chars for buf != NULL and size
2087 // of the needed buffer for buf == NULL so in either case the length of
2088 // the string (which never includes the terminating NUL) is one less
2089 return len - 1;
2090 }
2091
2092 virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2093 {
2094 /*
2095 we have a problem here: by default, WideCharToMultiByte() may
2096 replace characters unrepresentable in the target code page with bad
2097 quality approximations such as turning "1/2" symbol (U+00BD) into
2098 "1" for the code pages which don't have it and we, obviously, want
2099 to avoid this at any price
2100
2101 the trouble is that this function does it _silently_, i.e. it won't
2102 even tell us whether it did or not... Win98/2000 and higher provide
2103 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2104 we have to resort to a round trip, i.e. check that converting back
2105 results in the same string -- this is, of course, expensive but
2106 otherwise we simply can't be sure to not garble the data.
2107 */
2108
2109 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2110 // it doesn't work with CJK encodings (which we test for rather roughly
2111 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2112 // supporting it
2113 BOOL usedDef wxDUMMY_INITIALIZE(false);
2114 BOOL *pUsedDef;
2115 int flags;
2116 if ( CanUseNoBestFit() && m_CodePage < 50000 )
2117 {
2118 // it's our lucky day
2119 flags = WC_NO_BEST_FIT_CHARS;
2120 pUsedDef = &usedDef;
2121 }
2122 else // old system or unsupported encoding
2123 {
2124 flags = 0;
2125 pUsedDef = NULL;
2126 }
2127
2128 const size_t len = ::WideCharToMultiByte
2129 (
2130 m_CodePage, // code page
2131 flags, // either none or no best fit
2132 pwz, // input string
2133 -1, // it is (wide) NUL-terminated
2134 buf, // output buffer
2135 buf ? n : 0, // and its size
2136 NULL, // default "replacement" char
2137 pUsedDef // [out] was it used?
2138 );
2139
2140 if ( !len )
2141 {
2142 // function totally failed
2143 return wxCONV_FAILED;
2144 }
2145
2146 // if we were really converting, check if we succeeded
2147 if ( buf )
2148 {
2149 if ( flags )
2150 {
2151 // check if the conversion failed, i.e. if any replacements
2152 // were done
2153 if ( usedDef )
2154 return wxCONV_FAILED;
2155 }
2156 else // we must resort to double tripping...
2157 {
2158 wxWCharBuffer wcBuf(n);
2159 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2160 wcscmp(wcBuf, pwz) != 0 )
2161 {
2162 // we didn't obtain the same thing we started from, hence
2163 // the conversion was lossy and we consider that it failed
2164 return wxCONV_FAILED;
2165 }
2166 }
2167 }
2168
2169 // see the comment above for the reason of "len - 1"
2170 return len - 1;
2171 }
2172
2173 virtual size_t GetMBNulLen() const
2174 {
2175 if ( m_minMBCharWidth == 0 )
2176 {
2177 int len = ::WideCharToMultiByte
2178 (
2179 m_CodePage, // code page
2180 0, // no flags
2181 L"", // input string
2182 1, // translate just the NUL
2183 NULL, // output buffer
2184 0, // and its size
2185 NULL, // no replacement char
2186 NULL // [out] don't care if it was used
2187 );
2188
2189 wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2190 switch ( len )
2191 {
2192 default:
2193 wxLogDebug(_T("Unexpected NUL length %d"), len);
2194 self->m_minMBCharWidth = (size_t)-1;
2195 break;
2196
2197 case 0:
2198 self->m_minMBCharWidth = (size_t)-1;
2199 break;
2200
2201 case 1:
2202 case 2:
2203 case 4:
2204 self->m_minMBCharWidth = len;
2205 break;
2206 }
2207 }
2208
2209 return m_minMBCharWidth;
2210 }
2211
2212 virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2213
2214 bool IsOk() const { return m_CodePage != -1; }
2215
2216 private:
2217 static bool CanUseNoBestFit()
2218 {
2219 static int s_isWin98Or2k = -1;
2220
2221 if ( s_isWin98Or2k == -1 )
2222 {
2223 int verMaj, verMin;
2224 switch ( wxGetOsVersion(&verMaj, &verMin) )
2225 {
2226 case wxWIN95:
2227 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2228 break;
2229
2230 case wxWINDOWS_NT:
2231 s_isWin98Or2k = verMaj >= 5;
2232 break;
2233
2234 default:
2235 // unknown: be conservative by default
2236 s_isWin98Or2k = 0;
2237 break;
2238 }
2239
2240 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2241 }
2242
2243 return s_isWin98Or2k == 1;
2244 }
2245
2246 static bool IsAtLeastWin2kSP4()
2247 {
2248 #ifdef __WXWINCE__
2249 return false;
2250 #else
2251 static int s_isAtLeastWin2kSP4 = -1;
2252
2253 if ( s_isAtLeastWin2kSP4 == -1 )
2254 {
2255 OSVERSIONINFOEX ver;
2256
2257 memset(&ver, 0, sizeof(ver));
2258 ver.dwOSVersionInfoSize = sizeof(ver);
2259 GetVersionEx((OSVERSIONINFO*)&ver);
2260
2261 s_isAtLeastWin2kSP4 =
2262 ((ver.dwMajorVersion > 5) || // Vista+
2263 (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2264 (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2265 ver.wServicePackMajor >= 4)) // 2000 SP4+
2266 ? 1 : 0;
2267 }
2268
2269 return s_isAtLeastWin2kSP4 == 1;
2270 #endif
2271 }
2272
2273
2274 // the code page we're working with
2275 long m_CodePage;
2276
2277 // cached result of GetMBNulLen(), set to 0 initially meaning
2278 // "unknown"
2279 size_t m_minMBCharWidth;
2280 };
2281
2282 #endif // wxHAVE_WIN32_MB2WC
2283
2284 // ============================================================================
2285 // Cocoa conversion classes
2286 // ============================================================================
2287
2288 #if defined(__WXCOCOA__)
2289
2290 // RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2291 // Strangely enough, Core Foundation uses UTF-32 internally
2292 // quite a bit - it's just not public (yet).
2293
2294 #include <CoreFoundation/CFString.h>
2295 #include <CoreFoundation/CFStringEncodingExt.h>
2296
2297 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2298 {
2299 CFStringEncoding enc = kCFStringEncodingInvalidId ;
2300
2301 switch (encoding)
2302 {
2303 case wxFONTENCODING_DEFAULT :
2304 enc = CFStringGetSystemEncoding();
2305 break ;
2306
2307 case wxFONTENCODING_ISO8859_1 :
2308 enc = kCFStringEncodingISOLatin1 ;
2309 break ;
2310 case wxFONTENCODING_ISO8859_2 :
2311 enc = kCFStringEncodingISOLatin2;
2312 break ;
2313 case wxFONTENCODING_ISO8859_3 :
2314 enc = kCFStringEncodingISOLatin3 ;
2315 break ;
2316 case wxFONTENCODING_ISO8859_4 :
2317 enc = kCFStringEncodingISOLatin4;
2318 break ;
2319 case wxFONTENCODING_ISO8859_5 :
2320 enc = kCFStringEncodingISOLatinCyrillic;
2321 break ;
2322 case wxFONTENCODING_ISO8859_6 :
2323 enc = kCFStringEncodingISOLatinArabic;
2324 break ;
2325 case wxFONTENCODING_ISO8859_7 :
2326 enc = kCFStringEncodingISOLatinGreek;
2327 break ;
2328 case wxFONTENCODING_ISO8859_8 :
2329 enc = kCFStringEncodingISOLatinHebrew;
2330 break ;
2331 case wxFONTENCODING_ISO8859_9 :
2332 enc = kCFStringEncodingISOLatin5;
2333 break ;
2334 case wxFONTENCODING_ISO8859_10 :
2335 enc = kCFStringEncodingISOLatin6;
2336 break ;
2337 case wxFONTENCODING_ISO8859_11 :
2338 enc = kCFStringEncodingISOLatinThai;
2339 break ;
2340 case wxFONTENCODING_ISO8859_13 :
2341 enc = kCFStringEncodingISOLatin7;
2342 break ;
2343 case wxFONTENCODING_ISO8859_14 :
2344 enc = kCFStringEncodingISOLatin8;
2345 break ;
2346 case wxFONTENCODING_ISO8859_15 :
2347 enc = kCFStringEncodingISOLatin9;
2348 break ;
2349
2350 case wxFONTENCODING_KOI8 :
2351 enc = kCFStringEncodingKOI8_R;
2352 break ;
2353 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2354 enc = kCFStringEncodingDOSRussian;
2355 break ;
2356
2357 // case wxFONTENCODING_BULGARIAN :
2358 // enc = ;
2359 // break ;
2360
2361 case wxFONTENCODING_CP437 :
2362 enc = kCFStringEncodingDOSLatinUS ;
2363 break ;
2364 case wxFONTENCODING_CP850 :
2365 enc = kCFStringEncodingDOSLatin1;
2366 break ;
2367 case wxFONTENCODING_CP852 :
2368 enc = kCFStringEncodingDOSLatin2;
2369 break ;
2370 case wxFONTENCODING_CP855 :
2371 enc = kCFStringEncodingDOSCyrillic;
2372 break ;
2373 case wxFONTENCODING_CP866 :
2374 enc = kCFStringEncodingDOSRussian ;
2375 break ;
2376 case wxFONTENCODING_CP874 :
2377 enc = kCFStringEncodingDOSThai;
2378 break ;
2379 case wxFONTENCODING_CP932 :
2380 enc = kCFStringEncodingDOSJapanese;
2381 break ;
2382 case wxFONTENCODING_CP936 :
2383 enc = kCFStringEncodingDOSChineseSimplif ;
2384 break ;
2385 case wxFONTENCODING_CP949 :
2386 enc = kCFStringEncodingDOSKorean;
2387 break ;
2388 case wxFONTENCODING_CP950 :
2389 enc = kCFStringEncodingDOSChineseTrad;
2390 break ;
2391 case wxFONTENCODING_CP1250 :
2392 enc = kCFStringEncodingWindowsLatin2;
2393 break ;
2394 case wxFONTENCODING_CP1251 :
2395 enc = kCFStringEncodingWindowsCyrillic ;
2396 break ;
2397 case wxFONTENCODING_CP1252 :
2398 enc = kCFStringEncodingWindowsLatin1 ;
2399 break ;
2400 case wxFONTENCODING_CP1253 :
2401 enc = kCFStringEncodingWindowsGreek;
2402 break ;
2403 case wxFONTENCODING_CP1254 :
2404 enc = kCFStringEncodingWindowsLatin5;
2405 break ;
2406 case wxFONTENCODING_CP1255 :
2407 enc = kCFStringEncodingWindowsHebrew ;
2408 break ;
2409 case wxFONTENCODING_CP1256 :
2410 enc = kCFStringEncodingWindowsArabic ;
2411 break ;
2412 case wxFONTENCODING_CP1257 :
2413 enc = kCFStringEncodingWindowsBalticRim;
2414 break ;
2415 // This only really encodes to UTF7 (if that) evidently
2416 // case wxFONTENCODING_UTF7 :
2417 // enc = kCFStringEncodingNonLossyASCII ;
2418 // break ;
2419 case wxFONTENCODING_UTF8 :
2420 enc = kCFStringEncodingUTF8 ;
2421 break ;
2422 case wxFONTENCODING_EUC_JP :
2423 enc = kCFStringEncodingEUC_JP;
2424 break ;
2425 case wxFONTENCODING_UTF16 :
2426 enc = kCFStringEncodingUnicode ;
2427 break ;
2428 case wxFONTENCODING_MACROMAN :
2429 enc = kCFStringEncodingMacRoman ;
2430 break ;
2431 case wxFONTENCODING_MACJAPANESE :
2432 enc = kCFStringEncodingMacJapanese ;
2433 break ;
2434 case wxFONTENCODING_MACCHINESETRAD :
2435 enc = kCFStringEncodingMacChineseTrad ;
2436 break ;
2437 case wxFONTENCODING_MACKOREAN :
2438 enc = kCFStringEncodingMacKorean ;
2439 break ;
2440 case wxFONTENCODING_MACARABIC :
2441 enc = kCFStringEncodingMacArabic ;
2442 break ;
2443 case wxFONTENCODING_MACHEBREW :
2444 enc = kCFStringEncodingMacHebrew ;
2445 break ;
2446 case wxFONTENCODING_MACGREEK :
2447 enc = kCFStringEncodingMacGreek ;
2448 break ;
2449 case wxFONTENCODING_MACCYRILLIC :
2450 enc = kCFStringEncodingMacCyrillic ;
2451 break ;
2452 case wxFONTENCODING_MACDEVANAGARI :
2453 enc = kCFStringEncodingMacDevanagari ;
2454 break ;
2455 case wxFONTENCODING_MACGURMUKHI :
2456 enc = kCFStringEncodingMacGurmukhi ;
2457 break ;
2458 case wxFONTENCODING_MACGUJARATI :
2459 enc = kCFStringEncodingMacGujarati ;
2460 break ;
2461 case wxFONTENCODING_MACORIYA :
2462 enc = kCFStringEncodingMacOriya ;
2463 break ;
2464 case wxFONTENCODING_MACBENGALI :
2465 enc = kCFStringEncodingMacBengali ;
2466 break ;
2467 case wxFONTENCODING_MACTAMIL :
2468 enc = kCFStringEncodingMacTamil ;
2469 break ;
2470 case wxFONTENCODING_MACTELUGU :
2471 enc = kCFStringEncodingMacTelugu ;
2472 break ;
2473 case wxFONTENCODING_MACKANNADA :
2474 enc = kCFStringEncodingMacKannada ;
2475 break ;
2476 case wxFONTENCODING_MACMALAJALAM :
2477 enc = kCFStringEncodingMacMalayalam ;
2478 break ;
2479 case wxFONTENCODING_MACSINHALESE :
2480 enc = kCFStringEncodingMacSinhalese ;
2481 break ;
2482 case wxFONTENCODING_MACBURMESE :
2483 enc = kCFStringEncodingMacBurmese ;
2484 break ;
2485 case wxFONTENCODING_MACKHMER :
2486 enc = kCFStringEncodingMacKhmer ;
2487 break ;
2488 case wxFONTENCODING_MACTHAI :
2489 enc = kCFStringEncodingMacThai ;
2490 break ;
2491 case wxFONTENCODING_MACLAOTIAN :
2492 enc = kCFStringEncodingMacLaotian ;
2493 break ;
2494 case wxFONTENCODING_MACGEORGIAN :
2495 enc = kCFStringEncodingMacGeorgian ;
2496 break ;
2497 case wxFONTENCODING_MACARMENIAN :
2498 enc = kCFStringEncodingMacArmenian ;
2499 break ;
2500 case wxFONTENCODING_MACCHINESESIMP :
2501 enc = kCFStringEncodingMacChineseSimp ;
2502 break ;
2503 case wxFONTENCODING_MACTIBETAN :
2504 enc = kCFStringEncodingMacTibetan ;
2505 break ;
2506 case wxFONTENCODING_MACMONGOLIAN :
2507 enc = kCFStringEncodingMacMongolian ;
2508 break ;
2509 case wxFONTENCODING_MACETHIOPIC :
2510 enc = kCFStringEncodingMacEthiopic ;
2511 break ;
2512 case wxFONTENCODING_MACCENTRALEUR :
2513 enc = kCFStringEncodingMacCentralEurRoman ;
2514 break ;
2515 case wxFONTENCODING_MACVIATNAMESE :
2516 enc = kCFStringEncodingMacVietnamese ;
2517 break ;
2518 case wxFONTENCODING_MACARABICEXT :
2519 enc = kCFStringEncodingMacExtArabic ;
2520 break ;
2521 case wxFONTENCODING_MACSYMBOL :
2522 enc = kCFStringEncodingMacSymbol ;
2523 break ;
2524 case wxFONTENCODING_MACDINGBATS :
2525 enc = kCFStringEncodingMacDingbats ;
2526 break ;
2527 case wxFONTENCODING_MACTURKISH :
2528 enc = kCFStringEncodingMacTurkish ;
2529 break ;
2530 case wxFONTENCODING_MACCROATIAN :
2531 enc = kCFStringEncodingMacCroatian ;
2532 break ;
2533 case wxFONTENCODING_MACICELANDIC :
2534 enc = kCFStringEncodingMacIcelandic ;
2535 break ;
2536 case wxFONTENCODING_MACROMANIAN :
2537 enc = kCFStringEncodingMacRomanian ;
2538 break ;
2539 case wxFONTENCODING_MACCELTIC :
2540 enc = kCFStringEncodingMacCeltic ;
2541 break ;
2542 case wxFONTENCODING_MACGAELIC :
2543 enc = kCFStringEncodingMacGaelic ;
2544 break ;
2545 // case wxFONTENCODING_MACKEYBOARD :
2546 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2547 // break ;
2548
2549 default :
2550 // because gcc is picky
2551 break ;
2552 }
2553
2554 return enc ;
2555 }
2556
2557 class wxMBConv_cocoa : public wxMBConv
2558 {
2559 public:
2560 wxMBConv_cocoa()
2561 {
2562 Init(CFStringGetSystemEncoding()) ;
2563 }
2564
2565 wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2566 {
2567 m_encoding = conv.m_encoding;
2568 }
2569
2570 #if wxUSE_FONTMAP
2571 wxMBConv_cocoa(const wxChar* name)
2572 {
2573 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2574 }
2575 #endif
2576
2577 wxMBConv_cocoa(wxFontEncoding encoding)
2578 {
2579 Init( wxCFStringEncFromFontEnc(encoding) );
2580 }
2581
2582 ~wxMBConv_cocoa()
2583 {
2584 }
2585
2586 void Init( CFStringEncoding encoding)
2587 {
2588 m_encoding = encoding ;
2589 }
2590
2591 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2592 {
2593 wxASSERT(szUnConv);
2594
2595 CFStringRef theString = CFStringCreateWithBytes (
2596 NULL, //the allocator
2597 (const UInt8*)szUnConv,
2598 strlen(szUnConv),
2599 m_encoding,
2600 false //no BOM/external representation
2601 );
2602
2603 wxASSERT(theString);
2604
2605 size_t nOutLength = CFStringGetLength(theString);
2606
2607 if (szOut == NULL)
2608 {
2609 CFRelease(theString);
2610 return nOutLength;
2611 }
2612
2613 CFRange theRange = { 0, nOutSize };
2614
2615 #if SIZEOF_WCHAR_T == 4
2616 UniChar* szUniCharBuffer = new UniChar[nOutSize];
2617 #endif
2618
2619 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2620
2621 CFRelease(theString);
2622
2623 szUniCharBuffer[nOutLength] = '\0';
2624
2625 #if SIZEOF_WCHAR_T == 4
2626 wxMBConvUTF16 converter;
2627 converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2628 delete [] szUniCharBuffer;
2629 #endif
2630
2631 return nOutLength;
2632 }
2633
2634 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2635 {
2636 wxASSERT(szUnConv);
2637
2638 size_t nRealOutSize;
2639 size_t nBufSize = wxWcslen(szUnConv);
2640 UniChar* szUniBuffer = (UniChar*) szUnConv;
2641
2642 #if SIZEOF_WCHAR_T == 4
2643 wxMBConvUTF16 converter ;
2644 nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2645 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2646 converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2647 nBufSize /= sizeof(UniChar);
2648 #endif
2649
2650 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2651 NULL, //allocator
2652 szUniBuffer,
2653 nBufSize,
2654 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2655 );
2656
2657 wxASSERT(theString);
2658
2659 //Note that CER puts a BOM when converting to unicode
2660 //so we check and use getchars instead in that case
2661 if (m_encoding == kCFStringEncodingUnicode)
2662 {
2663 if (szOut != NULL)
2664 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2665
2666 nRealOutSize = CFStringGetLength(theString) + 1;
2667 }
2668 else
2669 {
2670 CFStringGetBytes(
2671 theString,
2672 CFRangeMake(0, CFStringGetLength(theString)),
2673 m_encoding,
2674 0, //what to put in characters that can't be converted -
2675 //0 tells CFString to return NULL if it meets such a character
2676 false, //not an external representation
2677 (UInt8*) szOut,
2678 nOutSize,
2679 (CFIndex*) &nRealOutSize
2680 );
2681 }
2682
2683 CFRelease(theString);
2684
2685 #if SIZEOF_WCHAR_T == 4
2686 delete[] szUniBuffer;
2687 #endif
2688
2689 return nRealOutSize - 1;
2690 }
2691
2692 virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2693
2694 bool IsOk() const
2695 {
2696 return m_encoding != kCFStringEncodingInvalidId &&
2697 CFStringIsEncodingAvailable(m_encoding);
2698 }
2699
2700 private:
2701 CFStringEncoding m_encoding ;
2702 };
2703
2704 #endif // defined(__WXCOCOA__)
2705
2706 // ============================================================================
2707 // Mac conversion classes
2708 // ============================================================================
2709
2710 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2711
2712 class wxMBConv_mac : public wxMBConv
2713 {
2714 public:
2715 wxMBConv_mac()
2716 {
2717 Init(CFStringGetSystemEncoding()) ;
2718 }
2719
2720 wxMBConv_mac(const wxMBConv_mac& conv)
2721 {
2722 Init(conv.m_char_encoding);
2723 }
2724
2725 #if wxUSE_FONTMAP
2726 wxMBConv_mac(const wxChar* name)
2727 {
2728 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2729 }
2730 #endif
2731
2732 wxMBConv_mac(wxFontEncoding encoding)
2733 {
2734 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2735 }
2736
2737 ~wxMBConv_mac()
2738 {
2739 OSStatus status = noErr ;
2740 status = TECDisposeConverter(m_MB2WC_converter);
2741 status = TECDisposeConverter(m_WC2MB_converter);
2742 }
2743
2744
2745 void Init( TextEncodingBase encoding)
2746 {
2747 OSStatus status = noErr ;
2748 m_char_encoding = encoding ;
2749 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2750
2751 status = TECCreateConverter(&m_MB2WC_converter,
2752 m_char_encoding,
2753 m_unicode_encoding);
2754 status = TECCreateConverter(&m_WC2MB_converter,
2755 m_unicode_encoding,
2756 m_char_encoding);
2757 }
2758
2759 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2760 {
2761 OSStatus status = noErr ;
2762 ByteCount byteOutLen ;
2763 ByteCount byteInLen = strlen(psz) + 1;
2764 wchar_t *tbuf = NULL ;
2765 UniChar* ubuf = NULL ;
2766 size_t res = 0 ;
2767
2768 if (buf == NULL)
2769 {
2770 // Apple specs say at least 32
2771 n = wxMax( 32, byteInLen ) ;
2772 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2773 }
2774
2775 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2776
2777 #if SIZEOF_WCHAR_T == 4
2778 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2779 #else
2780 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2781 #endif
2782
2783 status = TECConvertText(
2784 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2785 (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2786
2787 #if SIZEOF_WCHAR_T == 4
2788 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2789 // is not properly terminated we get random characters at the end
2790 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2791 wxMBConvUTF16 converter ;
2792 res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2793 free( ubuf ) ;
2794 #else
2795 res = byteOutLen / sizeof( UniChar ) ;
2796 #endif
2797
2798 if ( buf == NULL )
2799 free(tbuf) ;
2800
2801 if ( buf && res < n)
2802 buf[res] = 0;
2803
2804 return res ;
2805 }
2806
2807 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2808 {
2809 OSStatus status = noErr ;
2810 ByteCount byteOutLen ;
2811 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2812
2813 char *tbuf = NULL ;
2814
2815 if (buf == NULL)
2816 {
2817 // Apple specs say at least 32
2818 n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2819 tbuf = (char*) malloc( n ) ;
2820 }
2821
2822 ByteCount byteBufferLen = n ;
2823 UniChar* ubuf = NULL ;
2824
2825 #if SIZEOF_WCHAR_T == 4
2826 wxMBConvUTF16 converter ;
2827 size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2828 byteInLen = unicharlen ;
2829 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2830 converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2831 #else
2832 ubuf = (UniChar*) psz ;
2833 #endif
2834
2835 status = TECConvertText(
2836 m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2837 (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2838
2839 #if SIZEOF_WCHAR_T == 4
2840 free( ubuf ) ;
2841 #endif
2842
2843 if ( buf == NULL )
2844 free(tbuf) ;
2845
2846 size_t res = byteOutLen ;
2847 if ( buf && res < n)
2848 {
2849 buf[res] = 0;
2850
2851 //we need to double-trip to verify it didn't insert any ? in place
2852 //of bogus characters
2853 wxWCharBuffer wcBuf(n);
2854 size_t pszlen = wxWcslen(psz);
2855 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2856 wxWcslen(wcBuf) != pszlen ||
2857 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2858 {
2859 // we didn't obtain the same thing we started from, hence
2860 // the conversion was lossy and we consider that it failed
2861 return wxCONV_FAILED;
2862 }
2863 }
2864
2865 return res ;
2866 }
2867
2868 virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2869
2870 bool IsOk() const
2871 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2872
2873 private:
2874 TECObjectRef m_MB2WC_converter;
2875 TECObjectRef m_WC2MB_converter;
2876
2877 TextEncodingBase m_char_encoding;
2878 TextEncodingBase m_unicode_encoding;
2879 };
2880
2881 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2882
2883 // ============================================================================
2884 // wxEncodingConverter based conversion classes
2885 // ============================================================================
2886
2887 #if wxUSE_FONTMAP
2888
2889 class wxMBConv_wxwin : public wxMBConv
2890 {
2891 private:
2892 void Init()
2893 {
2894 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2895 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2896 }
2897
2898 public:
2899 // temporarily just use wxEncodingConverter stuff,
2900 // so that it works while a better implementation is built
2901 wxMBConv_wxwin(const wxChar* name)
2902 {
2903 if (name)
2904 m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2905 else
2906 m_enc = wxFONTENCODING_SYSTEM;
2907
2908 Init();
2909 }
2910
2911 wxMBConv_wxwin(wxFontEncoding enc)
2912 {
2913 m_enc = enc;
2914
2915 Init();
2916 }
2917
2918 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2919 {
2920 size_t inbuf = strlen(psz);
2921 if (buf)
2922 {
2923 if (!m2w.Convert(psz, buf))
2924 return wxCONV_FAILED;
2925 }
2926 return inbuf;
2927 }
2928
2929 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2930 {
2931 const size_t inbuf = wxWcslen(psz);
2932 if (buf)
2933 {
2934 if (!w2m.Convert(psz, buf))
2935 return wxCONV_FAILED;
2936 }
2937
2938 return inbuf;
2939 }
2940
2941 virtual size_t GetMBNulLen() const
2942 {
2943 switch ( m_enc )
2944 {
2945 case wxFONTENCODING_UTF16BE:
2946 case wxFONTENCODING_UTF16LE:
2947 return 2;
2948
2949 case wxFONTENCODING_UTF32BE:
2950 case wxFONTENCODING_UTF32LE:
2951 return 4;
2952
2953 default:
2954 return 1;
2955 }
2956 }
2957
2958 virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2959
2960 bool IsOk() const { return m_ok; }
2961
2962 public:
2963 wxFontEncoding m_enc;
2964 wxEncodingConverter m2w, w2m;
2965
2966 private:
2967 // were we initialized successfully?
2968 bool m_ok;
2969
2970 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2971 };
2972
2973 // make the constructors available for unit testing
2974 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2975 {
2976 wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2977 if ( !result->IsOk() )
2978 {
2979 delete result;
2980 return 0;
2981 }
2982
2983 return result;
2984 }
2985
2986 #endif // wxUSE_FONTMAP
2987
2988 // ============================================================================
2989 // wxCSConv implementation
2990 // ============================================================================
2991
2992 void wxCSConv::Init()
2993 {
2994 m_name = NULL;
2995 m_convReal = NULL;
2996 m_deferred = true;
2997 }
2998
2999 wxCSConv::wxCSConv(const wxChar *charset)
3000 {
3001 Init();
3002
3003 if ( charset )
3004 {
3005 SetName(charset);
3006 }
3007
3008 #if wxUSE_FONTMAP
3009 m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3010 #else
3011 m_encoding = wxFONTENCODING_SYSTEM;
3012 #endif
3013 }
3014
3015 wxCSConv::wxCSConv(wxFontEncoding encoding)
3016 {
3017 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3018 {
3019 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3020
3021 encoding = wxFONTENCODING_SYSTEM;
3022 }
3023
3024 Init();
3025
3026 m_encoding = encoding;
3027 }
3028
3029 wxCSConv::~wxCSConv()
3030 {
3031 Clear();
3032 }
3033
3034 wxCSConv::wxCSConv(const wxCSConv& conv)
3035 : wxMBConv()
3036 {
3037 Init();
3038
3039 SetName(conv.m_name);
3040 m_encoding = conv.m_encoding;
3041 }
3042
3043 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3044 {
3045 Clear();
3046
3047 SetName(conv.m_name);
3048 m_encoding = conv.m_encoding;
3049
3050 return *this;
3051 }
3052
3053 void wxCSConv::Clear()
3054 {
3055 free(m_name);
3056 delete m_convReal;
3057
3058 m_name = NULL;
3059 m_convReal = NULL;
3060 }
3061
3062 void wxCSConv::SetName(const wxChar *charset)
3063 {
3064 if (charset)
3065 {
3066 m_name = wxStrdup(charset);
3067 m_deferred = true;
3068 }
3069 }
3070
// Cache used by wxCSConv::DoCreate(): for each font encoding it remembers
// the charset name which iconv accepted for it, or an empty string if none
// of the known names for this encoding worked.
3071 #if wxUSE_FONTMAP
3072 #include "wx/hashmap.h"
3073
// hash map type with wxFontEncoding keys and wxString values
3074 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3075 wxEncodingNameCache );
3076
// the unique, file-local instance of the cache
3077 static wxEncodingNameCache gs_nameCache;
3078 #endif
3079
3080 wxMBConv *wxCSConv::DoCreate() const
3081 {
3082 #if wxUSE_FONTMAP
3083 wxLogTrace(TRACE_STRCONV,
3084 wxT("creating conversion for %s"),
3085 (m_name ? m_name
3086 : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3087 #endif // wxUSE_FONTMAP
3088
3089 // check for the special case of ASCII or ISO8859-1 charset: as we have
3090 // special knowledge of it anyhow, we don't need to create a special
3091 // conversion object
3092 if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3093 m_encoding == wxFONTENCODING_DEFAULT )
3094 {
3095 // don't convert at all
3096 return NULL;
3097 }
3098
3099 // we trust OS to do conversion better than we can so try external
3100 // conversion methods first
3101 //
3102 // the full order is:
3103 // 1. OS conversion (iconv() under Unix or Win32 API)
3104 // 2. hard coded conversions for UTF
3105 // 3. wxEncodingConverter as fall back
3106
3107 // step (1)
3108 #ifdef HAVE_ICONV
3109 #if !wxUSE_FONTMAP
3110 if ( m_name )
3111 #endif // !wxUSE_FONTMAP
3112 {
3113 wxString name(m_name);
3114 wxFontEncoding encoding(m_encoding);
3115
3116 if ( !name.empty() )
3117 {
3118 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3119 if ( conv->IsOk() )
3120 return conv;
3121
3122 delete conv;
3123
3124 #if wxUSE_FONTMAP
3125 encoding =
3126 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3127 #endif // wxUSE_FONTMAP
3128 }
3129 #if wxUSE_FONTMAP
3130 {
3131 const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3132 if ( it != gs_nameCache.end() )
3133 {
3134 if ( it->second.empty() )
3135 return NULL;
3136
3137 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3138 if ( conv->IsOk() )
3139 return conv;
3140
3141 delete conv;
3142 }
3143
3144 const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3145
3146 for ( ; *names; ++names )
3147 {
3148 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3149 if ( conv->IsOk() )
3150 {
3151 gs_nameCache[encoding] = *names;
3152 return conv;
3153 }
3154
3155 delete conv;
3156 }
3157
3158 gs_nameCache[encoding] = _T(""); // cache the failure
3159 }
3160 #endif // wxUSE_FONTMAP
3161 }
3162 #endif // HAVE_ICONV
3163
3164 #ifdef wxHAVE_WIN32_MB2WC
3165 {
3166 #if wxUSE_FONTMAP
3167 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3168 : new wxMBConv_win32(m_encoding);
3169 if ( conv->IsOk() )
3170 return conv;
3171
3172 delete conv;
3173 #else
3174 return NULL;
3175 #endif
3176 }
3177 #endif // wxHAVE_WIN32_MB2WC
3178
3179 #if defined(__WXMAC__)
3180 {
3181 // leave UTF16 and UTF32 to the built-ins of wx
3182 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3183 ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3184 {
3185 #if wxUSE_FONTMAP
3186 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3187 : new wxMBConv_mac(m_encoding);
3188 #else
3189 wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3190 #endif
3191 if ( conv->IsOk() )
3192 return conv;
3193
3194 delete conv;
3195 }
3196 }
3197 #endif
3198
3199 #if defined(__WXCOCOA__)
3200 {
3201 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3202 {
3203 #if wxUSE_FONTMAP
3204 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3205 : new wxMBConv_cocoa(m_encoding);
3206 #else
3207 wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3208 #endif
3209
3210 if ( conv->IsOk() )
3211 return conv;
3212
3213 delete conv;
3214 }
3215 }
3216 #endif
3217 // step (2)
3218 wxFontEncoding enc = m_encoding;
3219 #if wxUSE_FONTMAP
3220 if ( enc == wxFONTENCODING_SYSTEM && m_name )
3221 {
3222 // use "false" to suppress interactive dialogs -- we can be called from
3223 // anywhere and popping up a dialog from here is the last thing we want to
3224 // do
3225 enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3226 }
3227 #endif // wxUSE_FONTMAP
3228
3229 switch ( enc )
3230 {
3231 case wxFONTENCODING_UTF7:
3232 return new wxMBConvUTF7;
3233
3234 case wxFONTENCODING_UTF8:
3235 return new wxMBConvUTF8;
3236
3237 case wxFONTENCODING_UTF16BE:
3238 return new wxMBConvUTF16BE;
3239
3240 case wxFONTENCODING_UTF16LE:
3241 return new wxMBConvUTF16LE;
3242
3243 case wxFONTENCODING_UTF32BE:
3244 return new wxMBConvUTF32BE;
3245
3246 case wxFONTENCODING_UTF32LE:
3247 return new wxMBConvUTF32LE;
3248
3249 default:
3250 // nothing to do but put here to suppress gcc warnings
3251 break;
3252 }
3253
3254 // step (3)
3255 #if wxUSE_FONTMAP
3256 {
3257 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3258 : new wxMBConv_wxwin(m_encoding);
3259 if ( conv->IsOk() )
3260 return conv;
3261
3262 delete conv;
3263 }
3264 #endif // wxUSE_FONTMAP
3265
3266 // NB: This is a hack to prevent deadlock. What could otherwise happen
3267 // in Unicode build: wxConvLocal creation ends up being here
3268 // because of some failure and logs the error. But wxLog will try to
3269 // attach timestamp, for which it will need wxConvLocal (to convert
3270 // time to char* and then wchar_t*), but that fails, tries to log
3271 // error, but wxLog has a (already locked) critical section that
3272 // guards static buffer.
3273 static bool alreadyLoggingError = false;
3274 if (!alreadyLoggingError)
3275 {
3276 alreadyLoggingError = true;
3277 wxLogError(_("Cannot convert from the charset '%s'!"),
3278 m_name ? m_name
3279 :
3280 #if wxUSE_FONTMAP
3281 wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3282 #else // !wxUSE_FONTMAP
3283 wxString::Format(_("encoding %s"), m_encoding).c_str()
3284 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3285 );
3286
3287 alreadyLoggingError = false;
3288 }
3289
3290 return NULL;
3291 }
3292
3293 void wxCSConv::CreateConvIfNeeded() const
3294 {
3295 if ( m_deferred )
3296 {
3297 wxCSConv *self = (wxCSConv *)this; // const_cast
3298
3299 #if wxUSE_INTL
3300 // if we don't have neither the name nor the encoding, use the default
3301 // encoding for this system
3302 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3303 {
3304 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3305 }
3306 #endif // wxUSE_INTL
3307
3308 self->m_convReal = DoCreate();
3309 self->m_deferred = false;
3310 }
3311 }
3312
3313 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3314 {
3315 CreateConvIfNeeded();
3316
3317 if (m_convReal)
3318 return m_convReal->MB2WC(buf, psz, n);
3319
3320 // latin-1 (direct)
3321 size_t len = strlen(psz);
3322
3323 if (buf)
3324 {
3325 for (size_t c = 0; c <= len; c++)
3326 buf[c] = (unsigned char)(psz[c]);
3327 }
3328
3329 return len;
3330 }
3331
3332 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3333 {
3334 CreateConvIfNeeded();
3335
3336 if (m_convReal)
3337 return m_convReal->WC2MB(buf, psz, n);
3338
3339 // latin-1 (direct)
3340 const size_t len = wxWcslen(psz);
3341 if (buf)
3342 {
3343 for (size_t c = 0; c <= len; c++)
3344 {
3345 if (psz[c] > 0xFF)
3346 return wxCONV_FAILED;
3347
3348 buf[c] = (char)psz[c];
3349 }
3350 }
3351 else
3352 {
3353 for (size_t c = 0; c <= len; c++)
3354 {
3355 if (psz[c] > 0xFF)
3356 return wxCONV_FAILED;
3357 }
3358 }
3359
3360 return len;
3361 }
3362
3363 size_t wxCSConv::GetMBNulLen() const
3364 {
3365 CreateConvIfNeeded();
3366
3367 if ( m_convReal )
3368 {
3369 return m_convReal->GetMBNulLen();
3370 }
3371
3372 return 1;
3373 }
3374
3375 // ----------------------------------------------------------------------------
3376 // globals
3377 // ----------------------------------------------------------------------------
3378
// The object behind wxConvLibc: its class depends on the platform.
3379 #ifdef __WINDOWS__
3380 static wxMBConv_win32 wxConvLibcObj;
3381 #elif defined(__WXMAC__) && !defined(__MACH__)
3382 static wxMBConv_mac wxConvLibcObj ;
3383 #else
3384 static wxMBConvLibc wxConvLibcObj;
3385 #endif
3386
// The objects behind the other predefined conversions.
3387 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3388 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3389 static wxMBConvUTF7 wxConvUTF7Obj;
3390 static wxMBConvUTF8 wxConvUTF8Obj;
3391
// Exported references to the file-local objects above.
3392 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3393 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3394 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3395 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3396 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
// Exported pointers: wxConvCurrent initially points to the libc conversion
// and wxConvUI to the local one.
3397 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3398 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
// wxConvFileName points to the UTF-8 conversion if __WXOSX__ is defined and
// to the libc conversion otherwise.
3399 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3400 #ifdef __WXOSX__
3401 wxConvUTF8Obj;
3402 #else
3403 wxConvLibcObj;
3404 #endif
3405
3406 #else // !wxUSE_WCHAR_T
3407
// Without wchar_t support the predefined conversions are plain wxMBConv
// objects rather than references to objects of the specific classes.
3408 // stand-ins in absence of wchar_t
3409 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3410 wxConvISO8859_1,
3411 wxConvLocal,
3412 wxConvUTF8;
3413
3414 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T