]> git.saurik.com Git - wxWidgets.git/blob - src/common/strconv.cpp
UNICODE-compatible UTF8 implementation of wxStringXXXStream
[wxWidgets.git] / src / common / strconv.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 // Name: strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
6 // Modified by:
7 // Created: 29/01/98
8 // RCS-ID: $Id$
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
14
15 // ============================================================================
16 // declarations
17 // ============================================================================
18
19 // ----------------------------------------------------------------------------
20 // headers
21 // ----------------------------------------------------------------------------
22
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
25 #endif
26
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
29
30 #ifdef __BORLANDC__
31 #pragma hdrstop
32 #endif
33
34 #ifndef WX_PRECOMP
35 #include "wx/intl.h"
36 #include "wx/log.h"
37 #endif // WX_PRECOMP
38
39 #include "wx/strconv.h"
40
41 #if wxUSE_WCHAR_T
42
43 #ifdef __WXMSW__
44 #include "wx/msw/private.h"
45 #endif
46
47 #ifdef __WINDOWS__
48 #include "wx/msw/missing.h"
49 #endif
50
51 #ifndef __WXWINCE__
52 #include <errno.h>
53 #endif
54
55 #include <ctype.h>
56 #include <string.h>
57 #include <stdlib.h>
58
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
62
63 // ----------------------------------------------------------------------------
64 // headers
65 // ----------------------------------------------------------------------------
66
67 #ifdef __SALFORDC__
68 #include <clib.h>
69 #endif
70
71 #ifdef HAVE_ICONV
72 #include <iconv.h>
73 #include "wx/thread.h"
74 #endif
75
76 #include "wx/encconv.h"
77 #include "wx/fontmap.h"
78 #include "wx/utils.h"
79
80 #ifdef __WXMAC__
81 #include <ATSUnicode.h>
82 #include <TextCommon.h>
83 #include <TextEncodingConverter.h>
84
85 #include "wx/mac/private.h" // includes mac headers
86 #endif
87 // ----------------------------------------------------------------------------
88 // macros
89 // ----------------------------------------------------------------------------
90
// In-place byte swapping of the first len elements of the array str:
// BSWAP_UCS4 applies wxUINT32_SWAP_ALWAYS to every element and BSWAP_UTF16
// applies wxUINT16_SWAP_ALWAYS. Each macro expands to a brace-enclosed block
// (not a do/while), and both arguments are evaluated several times, so only
// simple side-effect-free expressions should be passed.
91 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
92 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
93
// Select, from SIZEOF_WCHAR_T, the names and helpers describing wchar_t:
//   WC_NAME       charset name without byte order information
//   WC_NAME_BEST  charset name with explicit byte order, chosen from
//                 WORDS_BIGENDIAN
//   WC_BSWAP      the matching byte swapping macro
//   WC_UTF16      defined only when wchar_t is 2 bytes wide
// Any other size of wchar_t stops the compilation with #error.
94 #if SIZEOF_WCHAR_T == 4
95 #define WC_NAME "UCS4"
96 #define WC_BSWAP BSWAP_UCS4
97 #ifdef WORDS_BIGENDIAN
98 #define WC_NAME_BEST "UCS-4BE"
99 #else
100 #define WC_NAME_BEST "UCS-4LE"
101 #endif
102 #elif SIZEOF_WCHAR_T == 2
103 #define WC_NAME "UTF16"
104 #define WC_BSWAP BSWAP_UTF16
105 #define WC_UTF16
106 #ifdef WORDS_BIGENDIAN
107 #define WC_NAME_BEST "UTF-16BE"
108 #else
109 #define WC_NAME_BEST "UTF-16LE"
110 #endif
111 #else // sizeof(wchar_t) != 2 nor 4
112 // does this ever happen?
113 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
114 #endif
115
116 // ============================================================================
117 // implementation
118 // ============================================================================
119
120 // ----------------------------------------------------------------------------
121 // UTF-16 en/decoding to/from UCS-4
122 // ----------------------------------------------------------------------------
123
124
125 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
126 {
127 if (input<=0xffff)
128 {
129 if (output)
130 *output = (wxUint16) input;
131 return 1;
132 }
133 else if (input>=0x110000)
134 {
135 return (size_t)-1;
136 }
137 else
138 {
139 if (output)
140 {
141 *output++ = (wxUint16) ((input >> 10)+0xd7c0);
142 *output = (wxUint16) ((input&0x3ff)+0xdc00);
143 }
144 return 2;
145 }
146 }
147
148 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
149 {
150 if ((*input<0xd800) || (*input>0xdfff))
151 {
152 output = *input;
153 return 1;
154 }
155 else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
156 {
157 output = *input;
158 return (size_t)-1;
159 }
160 else
161 {
162 output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
163 return 2;
164 }
165 }
166
167
168 // ----------------------------------------------------------------------------
169 // wxMBConv
170 // ----------------------------------------------------------------------------
171
// Out-of-line definition of the destructor; the body is intentionally empty.
172 wxMBConv::~wxMBConv()
173 {
174 // nothing to do here (necessary for Darwin linking probably)
175 }
176
177 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
178 {
179 if ( psz )
180 {
181 // calculate the length of the buffer needed first
182 size_t nLen = MB2WC(NULL, psz, 0);
183 if ( nLen != (size_t)-1 )
184 {
185 // now do the actual conversion
186 wxWCharBuffer buf(nLen);
187 nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
188 if ( nLen != (size_t)-1 )
189 {
190 return buf;
191 }
192 }
193 }
194
195 wxWCharBuffer buf((wchar_t *)NULL);
196
197 return buf;
198 }
199
200 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
201 {
202 if ( pwz )
203 {
204 size_t nLen = WC2MB(NULL, pwz, 0);
205 if ( nLen != (size_t)-1 )
206 {
207 wxCharBuffer buf(nLen+3); // space for a wxUint32 trailing zero
208 nLen = WC2MB(buf.data(), pwz, nLen + 4);
209 if ( nLen != (size_t)-1 )
210 {
211 return buf;
212 }
213 }
214 }
215
216 wxCharBuffer buf((char *)NULL);
217
218 return buf;
219 }
220
221 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
222 {
223 wxASSERT(pOutSize != NULL);
224
225 const char* szEnd = szString + nStringLen + 1;
226 const char* szPos = szString;
227 const char* szStart = szPos;
228
229 size_t nActualLength = 0;
230 size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
231
232 wxWCharBuffer theBuffer(nCurrentSize);
233
234 //Convert the string until the length() is reached, continuing the
235 //loop every time a null character is reached
236 while(szPos != szEnd)
237 {
238 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
239
240 //Get the length of the current (sub)string
241 size_t nLen = MB2WC(NULL, szPos, 0);
242
243 //Invalid conversion?
244 if( nLen == (size_t)-1 )
245 {
246 *pOutSize = 0;
247 theBuffer.data()[0u] = wxT('\0');
248 return theBuffer;
249 }
250
251
252 //Increase the actual length (+1 for current null character)
253 nActualLength += nLen + 1;
254
255 //if buffer too big, realloc the buffer
256 if (nActualLength > (nCurrentSize+1))
257 {
258 wxWCharBuffer theNewBuffer(nCurrentSize << 1);
259 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
260 theBuffer = theNewBuffer;
261 nCurrentSize <<= 1;
262 }
263
264 //Convert the current (sub)string
265 if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
266 {
267 *pOutSize = 0;
268 theBuffer.data()[0u] = wxT('\0');
269 return theBuffer;
270 }
271
272 //Increment to next (sub)string
273 //Note that we have to use strlen here instead of nLen
274 //here because XX2XX gives us the size of the output buffer,
275 //not neccessarly the length of the string
276 szPos += strlen(szPos) + 1;
277 }
278
279 //success - return actual length and the buffer
280 *pOutSize = nActualLength;
281 return theBuffer;
282 }
283
284 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
285 {
286 wxASSERT(pOutSize != NULL);
287
288 const wchar_t* szEnd = szString + nStringLen + 1;
289 const wchar_t* szPos = szString;
290 const wchar_t* szStart = szPos;
291
292 size_t nActualLength = 0;
293 size_t nCurrentSize = nStringLen << 2; //try * 4 first
294
295 wxCharBuffer theBuffer(nCurrentSize);
296
297 //Convert the string until the length() is reached, continuing the
298 //loop every time a null character is reached
299 while(szPos != szEnd)
300 {
301 wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
302
303 //Get the length of the current (sub)string
304 size_t nLen = WC2MB(NULL, szPos, 0);
305
306 //Invalid conversion?
307 if( nLen == (size_t)-1 )
308 {
309 *pOutSize = 0;
310 theBuffer.data()[0u] = wxT('\0');
311 return theBuffer;
312 }
313
314 //Increase the actual length (+1 for current null character)
315 nActualLength += nLen + 1;
316
317 //if buffer too big, realloc the buffer
318 if (nActualLength > (nCurrentSize+1))
319 {
320 wxCharBuffer theNewBuffer(nCurrentSize << 1);
321 memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
322 theBuffer = theNewBuffer;
323 nCurrentSize <<= 1;
324 }
325
326 //Convert the current (sub)string
327 if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
328 {
329 *pOutSize = 0;
330 theBuffer.data()[0u] = wxT('\0');
331 return theBuffer;
332 }
333
334 //Increment to next (sub)string
335 //Note that we have to use wxWcslen here instead of nLen
336 //here because XX2XX gives us the size of the output buffer,
337 //not neccessarly the length of the string
338 szPos += wxWcslen(szPos) + 1;
339 }
340
341 //success - return actual length and the buffer
342 *pOutSize = nActualLength;
343 return theBuffer;
344 }
345
346 // ----------------------------------------------------------------------------
347 // wxMBConvLibc
348 // ----------------------------------------------------------------------------
349
350 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
351 {
352 return wxMB2WC(buf, psz, n);
353 }
354
355 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
356 {
357 return wxWC2MB(buf, psz, n);
358 }
359 // ----------------------------------------------------------------------------
360 // UTF-7
361 // ----------------------------------------------------------------------------
362
363 // Implementation (C) 2004 Fredrik Roubert
364
365 //
366 // BASE64 decoding table
367 //
// The table has 256 entries and is indexed by the value of an input byte.
// The entries for 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/' hold the 6-bit
// value of that BASE64 digit (0x00-0x3f); every other entry is 0xff, which
// the UTF-7 decoder uses to detect the end of an encoded run.
368 static const unsigned char utf7unb64[] =
369 {
370 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
371 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
372 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
373 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
374 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
375 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
376 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
377 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
378 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
379 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
380 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
381 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
382 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
383 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
384 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
385 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
386 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
387 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
388 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
389 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
390 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
391 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
392 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
393 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
394 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
395 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
396 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
397 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
398 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
399 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
400 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
401 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
402 };
403
404 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
405 {
406
407 size_t len = 0;
408
409 while (*psz && ((!buf) || (len < n)))
410 {
411 unsigned char cc = *psz++;
412 if (cc != '+')
413 {
414 // plain ASCII char
415 if (buf)
416 *buf++ = cc;
417 len++;
418 }
419 else if (*psz == '-')
420 {
421 // encoded plus sign
422 if (buf)
423 *buf++ = cc;
424 len++;
425 psz++;
426 }
427 else
428 {
429 // BASE64 encoded string
430 bool lsb;
431 unsigned char c;
432 unsigned int d, l;
433 for (lsb = false, d = 0, l = 0;
434 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
435 {
436 d <<= 6;
437 d += cc;
438 for (l += 6; l >= 8; lsb = !lsb)
439 {
440 c = (unsigned char)((d >> (l -= 8)) % 256);
441 if (lsb)
442 {
443 if (buf)
444 *buf++ |= c;
445 len ++;
446 }
447 else
448 if (buf)
449 *buf = (wchar_t)(c << 8);
450 }
451 }
452 if (*psz == '-')
453 psz++;
454 }
455 }
456 if (buf && (len < n))
457 *buf = 0;
458 return len;
459 }
460
461 //
462 // BASE64 encoding table
463 //
// Indexed by a 6-bit value (0-63), gives the character used for that BASE64
// digit: 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/' in this order.
464 static const unsigned char utf7enb64[] =
465 {
466 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
467 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
468 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
469 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
470 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
471 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
472 'w', 'x', 'y', 'z', '0', '1', '2', '3',
473 '4', '5', '6', '7', '8', '9', '+', '/'
474 };
475
476 //
477 // UTF-7 encoding table
478 //
479 // 0 - Set D (directly encoded characters)
480 // 1 - Set O (optional direct characters)
481 // 2 - whitespace characters (optional)
482 // 3 - special characters
483 //
// The table is indexed by a character code below 0x80 and gives the class of
// that character according to the legend above. The encoder in this file
// copies only the characters of class 0 unchanged; all the other ones are
// sent through BASE64.
484 static const unsigned char utf7encode[128] =
485 {
486 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
487 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
488 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
490 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
492 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
494 };
495
// Encode a NUL-terminated wide character string as UTF-7.
//
// buf  output buffer, or NULL to only count the bytes needed
// psz  the wide character input
// n    capacity of buf in bytes (ignored when buf is NULL)
//
// Returns the number of bytes produced, not counting the terminating NUL
// (stored only if len < n at the end), or (size_t)-1 if, in builds without
// WC_UTF16, a character above 0xffff is met.
//
// NOTE(review): len < n is tested only at the top of the outer loop; the
// '+', BASE64 digits and '-' of a run are stored without further checks, so
// with a buffer that is too small the writes can go past buf + n - confirm
// that all callers size the buffer from a previous counting call.
// NOTE(review): cc is a wchar_t; where that type is signed, a negative value
// passes the cc < 0x80 test and is used as index into utf7encode - confirm
// whether such input can occur.
496 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t
497 *psz, size_t n) const
498 {
499
500
501 size_t len = 0;
502
503 while (*psz && ((!buf) || (len < n)))
504 {
505 wchar_t cc = *psz++;
506 if (cc < 0x80 && utf7encode[cc] < 1)
507 {
508 // plain ASCII char
509 if (buf)
510 *buf++ = (char)cc;
511 len++;
512 }
513 #ifndef WC_UTF16
514 else if (((wxUint32)cc) > 0xffff)
515 {
516 // no surrogate pair generation (yet?)
517 return (size_t)-1;
518 }
519 #endif
520 else
521 {
// everything else is written as '+' <BASE64 digits> '-'; the '+' character
// itself has no digits and so becomes "+-"
522 if (buf)
523 *buf++ = '+';
524 len++;
525 if (cc != '+')
526 {
527 // BASE64 encode string
// d collects the bits to output and l is the number of bits in d which have
// not been written yet; lsb selects the high (0) or low (1) byte of cc
528 unsigned int lsb, d, l;
529 for (d = 0, l = 0;; psz++)
530 {
531 for (lsb = 0; lsb < 2; lsb ++)
532 {
533 d <<= 8;
534 d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
535
536 for (l += 8; l >= 6; )
537 {
538 l -= 6;
539 if (buf)
540 *buf++ = utf7enb64[(d >> l) % 64];
541 len++;
542 }
543 }
// peek at the next character: the run ends at the end of the string or at a
// character which can be copied directly; psz is advanced by the for
// statement only if the character joins the run
544 cc = *psz;
545 if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
546 break;
547 }
// l is now 0, 2 or 4: write the left-over bits as the high bits of one last
// digit
548 if (l != 0)
549 {
550 if (buf)
551 *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
552 len++;
553 }
554 }
555 if (buf)
556 *buf++ = '-';
557 len++;
558 }
559 }
560 if (buf && (len < n))
561 *buf = 0;
562 return len;
563 }
564
565 // ----------------------------------------------------------------------------
566 // UTF-8
567 // ----------------------------------------------------------------------------
568
// utf8_max[i] is the largest value which the UTF-8 code in this file stores
// in i + 1 bytes. The last entry, 0xffffffff, cannot be exceeded by a
// wxUint32 and so always stops a search through the table.
569 static wxUint32 utf8_max[]=
570 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
571
572 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
573 {
574 size_t len = 0;
575
576 while (*psz && ((!buf) || (len < n)))
577 {
578 unsigned char cc = *psz++, fc = cc;
579 unsigned cnt;
580 for (cnt = 0; fc & 0x80; cnt++)
581 fc <<= 1;
582 if (!cnt)
583 {
584 // plain ASCII char
585 if (buf)
586 *buf++ = cc;
587 len++;
588 }
589 else
590 {
591 cnt--;
592 if (!cnt)
593 {
594 // invalid UTF-8 sequence
595 return (size_t)-1;
596 }
597 else
598 {
599 unsigned ocnt = cnt - 1;
600 wxUint32 res = cc & (0x3f >> cnt);
601 while (cnt--)
602 {
603 cc = *psz++;
604 if ((cc & 0xC0) != 0x80)
605 {
606 // invalid UTF-8 sequence
607 return (size_t)-1;
608 }
609 res = (res << 6) | (cc & 0x3f);
610 }
611 if (res <= utf8_max[ocnt])
612 {
613 // illegal UTF-8 encoding
614 return (size_t)-1;
615 }
616 #ifdef WC_UTF16
617 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
618 size_t pa = encode_utf16(res, (wxUint16 *)buf);
619 if (pa == (size_t)-1)
620 return (size_t)-1;
621 if (buf)
622 buf += pa;
623 len += pa;
624 #else // !WC_UTF16
625 if (buf)
626 *buf++ = res;
627 len++;
628 #endif // WC_UTF16/!WC_UTF16
629 }
630 }
631 }
632 if (buf && (len < n))
633 *buf = 0;
634 return len;
635 }
636
637 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
638 {
639 size_t len = 0;
640
641 while (*psz && ((!buf) || (len < n)))
642 {
643 wxUint32 cc;
644 #ifdef WC_UTF16
645 // cast is ok for WC_UTF16
646 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
647 psz += (pa == (size_t)-1) ? 1 : pa;
648 #else
649 cc=(*psz++) & 0x7fffffff;
650 #endif
651 unsigned cnt;
652 for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
653 if (!cnt)
654 {
655 // plain ASCII char
656 if (buf)
657 *buf++ = (char) cc;
658 len++;
659 }
660
661 else
662 {
663 len += cnt + 1;
664 if (buf)
665 {
666 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
667 while (cnt--)
668 *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
669 }
670 }
671 }
672
673 if (buf && (len<n)) *buf = 0;
674
675 return len;
676 }
677
678
679
680
681 // ----------------------------------------------------------------------------
682 // UTF-16
683 // ----------------------------------------------------------------------------
684
// The UTF-16 converters are implemented below under the names
// wxMBConvUTF16straight and wxMBConvUTF16swap. These macros map the two
// names to wxMBConvUTF16BE and wxMBConvUTF16LE according to WORDS_BIGENDIAN:
// "straight" is the class for the byte order selected by that symbol and
// "swap" is the class for the opposite byte order.
685 #ifdef WORDS_BIGENDIAN
686 #define wxMBConvUTF16straight wxMBConvUTF16BE
687 #define wxMBConvUTF16swap wxMBConvUTF16LE
688 #else
689 #define wxMBConvUTF16swap wxMBConvUTF16BE
690 #define wxMBConvUTF16straight wxMBConvUTF16LE
691 #endif
692
693
694 #ifdef WC_UTF16
695
696 // copy 16bit MB to 16bit String
697 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
698 {
699 size_t len=0;
700
701 while (*(wxUint16*)psz && (!buf || len < n))
702 {
703 if (buf)
704 *buf++ = *(wxUint16*)psz;
705 len++;
706
707 psz += sizeof(wxUint16);
708 }
709 if (buf && len<n) *buf=0;
710
711 return len;
712 }
713
714
715 // copy 16bit String to 16bit MB
716 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
717 {
718 size_t len=0;
719
720 while (*psz && (!buf || len < n))
721 {
722 if (buf)
723 {
724 *(wxUint16*)buf = *psz;
725 buf += sizeof(wxUint16);
726 }
727 len += sizeof(wxUint16);
728 psz++;
729 }
730 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
731
732 return len;
733 }
734
735
736 // swap 16bit MB to 16bit String
737 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
738 {
739 size_t len=0;
740
741 while (*(wxUint16*)psz && (!buf || len < n))
742 {
743 if (buf)
744 {
745 ((char *)buf)[0] = psz[1];
746 ((char *)buf)[1] = psz[0];
747 buf++;
748 }
749 len++;
750 psz += sizeof(wxUint16);
751 }
752 if (buf && len<n) *buf=0;
753
754 return len;
755 }
756
757
758 // swap 16bit MB to 16bit String
759 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
760 {
761 size_t len=0;
762
763 while (*psz && (!buf || len < n))
764 {
765 if (buf)
766 {
767 *buf++ = ((char*)psz)[1];
768 *buf++ = ((char*)psz)[0];
769 }
770 len += sizeof(wxUint16);
771 psz++;
772 }
773 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
774
775 return len;
776 }
777
778
779 #else // WC_UTF16
780
781
782 // copy 16bit MB to 32bit String
783 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
784 {
785 size_t len=0;
786
787 while (*(wxUint16*)psz && (!buf || len < n))
788 {
789 wxUint32 cc;
790 size_t pa=decode_utf16((wxUint16*)psz, cc);
791 if (pa == (size_t)-1)
792 return pa;
793
794 if (buf)
795 *buf++ = cc;
796 len++;
797 psz += pa * sizeof(wxUint16);
798 }
799 if (buf && len<n) *buf=0;
800
801 return len;
802 }
803
804
805 // copy 32bit String to 16bit MB
806 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
807 {
808 size_t len=0;
809
810 while (*psz && (!buf || len < n))
811 {
812 wxUint16 cc[2];
813 size_t pa=encode_utf16(*psz, cc);
814
815 if (pa == (size_t)-1)
816 return pa;
817
818 if (buf)
819 {
820 *(wxUint16*)buf = cc[0];
821 buf += sizeof(wxUint16);
822 if (pa > 1)
823 {
824 *(wxUint16*)buf = cc[1];
825 buf += sizeof(wxUint16);
826 }
827 }
828
829 len += pa*sizeof(wxUint16);
830 psz++;
831 }
832 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
833
834 return len;
835 }
836
837
838 // swap 16bit MB to 32bit String
839 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
840 {
841 size_t len=0;
842
843 while (*(wxUint16*)psz && (!buf || len < n))
844 {
845 wxUint32 cc;
846 char tmp[4];
847 tmp[0]=psz[1]; tmp[1]=psz[0];
848 tmp[2]=psz[3]; tmp[3]=psz[2];
849
850 size_t pa=decode_utf16((wxUint16*)tmp, cc);
851 if (pa == (size_t)-1)
852 return pa;
853
854 if (buf)
855 *buf++ = cc;
856
857 len++;
858 psz += pa * sizeof(wxUint16);
859 }
860 if (buf && len<n) *buf=0;
861
862 return len;
863 }
864
865
866 // swap 32bit String to 16bit MB
867 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
868 {
869 size_t len=0;
870
871 while (*psz && (!buf || len < n))
872 {
873 wxUint16 cc[2];
874 size_t pa=encode_utf16(*psz, cc);
875
876 if (pa == (size_t)-1)
877 return pa;
878
879 if (buf)
880 {
881 *buf++ = ((char*)cc)[1];
882 *buf++ = ((char*)cc)[0];
883 if (pa > 1)
884 {
885 *buf++ = ((char*)cc)[3];
886 *buf++ = ((char*)cc)[2];
887 }
888 }
889
890 len += pa*sizeof(wxUint16);
891 psz++;
892 }
893 if (buf && len<=n-sizeof(wxUint16)) *(wxUint16*)buf=0;
894
895 return len;
896 }
897
898 #endif // WC_UTF16
899
900
901 // ----------------------------------------------------------------------------
902 // UTF-32
903 // ----------------------------------------------------------------------------
904
// The UTF-32 converters are implemented below under the names
// wxMBConvUTF32straight and wxMBConvUTF32swap. These macros map the two
// names to wxMBConvUTF32BE and wxMBConvUTF32LE according to WORDS_BIGENDIAN:
// "straight" is the class for the byte order selected by that symbol and
// "swap" is the class for the opposite byte order.
905 #ifdef WORDS_BIGENDIAN
906 #define wxMBConvUTF32straight wxMBConvUTF32BE
907 #define wxMBConvUTF32swap wxMBConvUTF32LE
908 #else
909 #define wxMBConvUTF32swap wxMBConvUTF32BE
910 #define wxMBConvUTF32straight wxMBConvUTF32LE
911 #endif
912
913
// Definitions of the two UTF-32 converter objects, one for each byte order.
914 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
915 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
916
917
918 #ifdef WC_UTF16
919
920 // copy 32bit MB to 16bit String
921 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
922 {
923 size_t len=0;
924
925 while (*(wxUint32*)psz && (!buf || len < n))
926 {
927 wxUint16 cc[2];
928
929 size_t pa=encode_utf16(*(wxUint32*)psz, cc);
930 if (pa == (size_t)-1)
931 return pa;
932
933 if (buf)
934 {
935 *buf++ = cc[0];
936 if (pa > 1)
937 *buf++ = cc[1];
938 }
939 len += pa;
940 psz += sizeof(wxUint32);
941 }
942 if (buf && len<n) *buf=0;
943
944 return len;
945 }
946
947
948 // copy 16bit String to 32bit MB
949 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
950 {
951 size_t len=0;
952
953 while (*psz && (!buf || len < n))
954 {
955 wxUint32 cc;
956
957 // cast is ok for WC_UTF16
958 size_t pa = decode_utf16((const wxUint16 *)psz, cc);
959 if (pa == (size_t)-1)
960 return pa;
961
962 if (buf)
963 {
964 *(wxUint32*)buf = cc;
965 buf += sizeof(wxUint32);
966 }
967 len += sizeof(wxUint32);
968 psz += pa;
969 }
970
971 if (buf && len<=n-sizeof(wxUint32))
972 *(wxUint32*)buf=0;
973
974 return len;
975 }
976
977
978
979 // swap 32bit MB to 16bit String
980 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
981 {
982 size_t len=0;
983
984 while (*(wxUint32*)psz && (!buf || len < n))
985 {
986 char tmp[4];
987 tmp[0] = psz[3]; tmp[1] = psz[2];
988 tmp[2] = psz[1]; tmp[3] = psz[0];
989
990
991 wxUint16 cc[2];
992
993 size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
994 if (pa == (size_t)-1)
995 return pa;
996
997 if (buf)
998 {
999 *buf++ = cc[0];
1000 if (pa > 1)
1001 *buf++ = cc[1];
1002 }
1003 len += pa;
1004 psz += sizeof(wxUint32);
1005 }
1006
1007 if (buf && len<n)
1008 *buf=0;
1009
1010 return len;
1011 }
1012
1013
1014 // swap 16bit String to 32bit MB
1015 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1016 {
1017 size_t len=0;
1018
1019 while (*psz && (!buf || len < n))
1020 {
1021 char cc[4];
1022
1023 // cast is ok for WC_UTF16
1024 size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1025 if (pa == (size_t)-1)
1026 return pa;
1027
1028 if (buf)
1029 {
1030 *buf++ = cc[3];
1031 *buf++ = cc[2];
1032 *buf++ = cc[1];
1033 *buf++ = cc[0];
1034 }
1035 len += sizeof(wxUint32);
1036 psz += pa;
1037 }
1038
1039 if (buf && len<=n-sizeof(wxUint32))
1040 *(wxUint32*)buf=0;
1041
1042 return len;
1043 }
1044
1045 #else // WC_UTF16
1046
1047
1048 // copy 32bit MB to 32bit String
1049 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1050 {
1051 size_t len=0;
1052
1053 while (*(wxUint32*)psz && (!buf || len < n))
1054 {
1055 if (buf)
1056 *buf++ = *(wxUint32*)psz;
1057 len++;
1058 psz += sizeof(wxUint32);
1059 }
1060
1061 if (buf && len<n)
1062 *buf=0;
1063
1064 return len;
1065 }
1066
1067
1068 // copy 32bit String to 32bit MB
1069 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1070 {
1071 size_t len=0;
1072
1073 while (*psz && (!buf || len < n))
1074 {
1075 if (buf)
1076 {
1077 *(wxUint32*)buf = *psz;
1078 buf += sizeof(wxUint32);
1079 }
1080
1081 len += sizeof(wxUint32);
1082 psz++;
1083 }
1084
1085 if (buf && len<=n-sizeof(wxUint32))
1086 *(wxUint32*)buf=0;
1087
1088 return len;
1089 }
1090
1091
1092 // swap 32bit MB to 32bit String
1093 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1094 {
1095 size_t len=0;
1096
1097 while (*(wxUint32*)psz && (!buf || len < n))
1098 {
1099 if (buf)
1100 {
1101 ((char *)buf)[0] = psz[3];
1102 ((char *)buf)[1] = psz[2];
1103 ((char *)buf)[2] = psz[1];
1104 ((char *)buf)[3] = psz[0];
1105 buf++;
1106 }
1107 len++;
1108 psz += sizeof(wxUint32);
1109 }
1110
1111 if (buf && len<n)
1112 *buf=0;
1113
1114 return len;
1115 }
1116
1117
1118 // swap 32bit String to 32bit MB
1119 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1120 {
1121 size_t len=0;
1122
1123 while (*psz && (!buf || len < n))
1124 {
1125 if (buf)
1126 {
1127 *buf++ = ((char *)psz)[3];
1128 *buf++ = ((char *)psz)[2];
1129 *buf++ = ((char *)psz)[1];
1130 *buf++ = ((char *)psz)[0];
1131 }
1132 len += sizeof(wxUint32);
1133 psz++;
1134 }
1135
1136 if (buf && len<=n-sizeof(wxUint32))
1137 *(wxUint32*)buf=0;
1138
1139 return len;
1140 }
1141
1142
1143 #endif // WC_UTF16
1144
1145
1146 // ============================================================================
1147 // The classes doing conversion using the iconv_xxx() functions
1148 // ============================================================================
1149
1150 #ifdef HAVE_ICONV
1151
1152 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1153 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1154 // (unless there's yet another bug in glibc) the only case when iconv()
1155 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1156 // left in the input buffer -- when _real_ error occurs,
1157 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1158 // iconv() failure.
1159 // [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) takes the value returned by iconv() and the
// number of input bytes left over. For glibc 2.0 and 2.1 a (size_t)-1 result
// with errno == E2BIG and no input left is not treated as a failure;
// everywhere else any (size_t)-1 result is a failure.
1160 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1161 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1162 (errno != E2BIG || bufLeft != 0))
1163 #else
1164 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1165 #endif
1166
// Casts the address of the input pointer to the type used for the second
// argument of iconv(); ICONV_CONST is not defined in this file (presumably it
// comes from the build configuration - TODO confirm).
1167 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1168
1169 // ----------------------------------------------------------------------------
1170 // wxMBConv_iconv: encapsulates an iconv character set
1171 // ----------------------------------------------------------------------------
1172
1173 class wxMBConv_iconv : public wxMBConv
1174 {
1175 public:
1176 wxMBConv_iconv(const wxChar *name);
1177 virtual ~wxMBConv_iconv();
1178
1179 virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1180 virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1181
1182 bool IsOk() const
1183 { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1184
1185 protected:
1186 // the iconv handlers used to translate from multibyte to wide char and in
1187 // the other direction
1188 iconv_t m2w,
1189 w2m;
1190 #if wxUSE_THREADS
1191 // guards access to m2w and w2m objects
1192 wxMutex m_iconvMutex;
1193 #endif
1194
1195 private:
1196 // the name (for iconv_open()) of a wide char charset -- if none is
1197 // available on this machine, it will remain NULL
1198 static const char *ms_wcCharsetName;
1199
1200 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1201 // different endian-ness than the native one
1202 static bool ms_wcNeedsSwap;
1203 };
1204
1205 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1206 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1207
1208 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1209 {
1210 // Do it the hard way
1211 char cname[100];
1212 for (size_t i = 0; i < wxStrlen(name)+1; i++)
1213 cname[i] = (char) name[i];
1214
1215 // check for charset that represents wchar_t:
1216 if (ms_wcCharsetName == NULL)
1217 {
1218 ms_wcNeedsSwap = false;
1219
1220 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1221 ms_wcCharsetName = WC_NAME_BEST;
1222 m2w = iconv_open(ms_wcCharsetName, cname);
1223
1224 if (m2w == (iconv_t)-1)
1225 {
1226 // try charset w/o bytesex info (e.g. "UCS4")
1227 // and check for bytesex ourselves:
1228 ms_wcCharsetName = WC_NAME;
1229 m2w = iconv_open(ms_wcCharsetName, cname);
1230
1231 // last bet, try if it knows WCHAR_T pseudo-charset
1232 if (m2w == (iconv_t)-1)
1233 {
1234 ms_wcCharsetName = "WCHAR_T";
1235 m2w = iconv_open(ms_wcCharsetName, cname);
1236 }
1237
1238 if (m2w != (iconv_t)-1)
1239 {
1240 char buf[2], *bufPtr;
1241 wchar_t wbuf[2], *wbufPtr;
1242 size_t insz, outsz;
1243 size_t res;
1244
1245 buf[0] = 'A';
1246 buf[1] = 0;
1247 wbuf[0] = 0;
1248 insz = 2;
1249 outsz = SIZEOF_WCHAR_T * 2;
1250 wbufPtr = wbuf;
1251 bufPtr = buf;
1252
1253 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1254 (char**)&wbufPtr, &outsz);
1255
1256 if (ICONV_FAILED(res, insz))
1257 {
1258 ms_wcCharsetName = NULL;
1259 wxLogLastError(wxT("iconv"));
1260 wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1261 }
1262 else
1263 {
1264 ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1265 }
1266 }
1267 else
1268 {
1269 ms_wcCharsetName = NULL;
1270
1271 // VS: we must not output an error here, since wxWidgets will safely
1272 // fall back to using wxEncodingConverter.
1273 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1274 //wxLogError(
1275 }
1276 }
1277 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1278 }
1279 else // we already have ms_wcCharsetName
1280 {
1281 m2w = iconv_open(ms_wcCharsetName, cname);
1282 }
1283
1284 // NB: don't ever pass NULL to iconv_open(), it may crash!
1285 if ( ms_wcCharsetName )
1286 {
1287 w2m = iconv_open( cname, ms_wcCharsetName);
1288 }
1289 else
1290 {
1291 w2m = (iconv_t)-1;
1292 }
1293 }
1294
1295 wxMBConv_iconv::~wxMBConv_iconv()
1296 {
1297 if ( m2w != (iconv_t)-1 )
1298 iconv_close(m2w);
1299 if ( w2m != (iconv_t)-1 )
1300 iconv_close(w2m);
1301 }
1302
1303 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1304 {
1305 #if wxUSE_THREADS
1306 // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1307 // Unfortunately there is a couple of global wxCSConv objects such as
1308 // wxConvLocal that are used all over wx code, so we have to make sure
1309 // the handle is used by at most one thread at the time. Otherwise
1310 // only a few wx classes would be safe to use from non-main threads
1311 // as MB<->WC conversion would fail "randomly".
1312 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1313 #endif
1314
1315 size_t inbuf = strlen(psz);
1316 size_t outbuf = n * SIZEOF_WCHAR_T;
1317 size_t res, cres;
1318 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1319 wchar_t *bufPtr = buf;
1320 const char *pszPtr = psz;
1321
1322 if (buf)
1323 {
1324 // have destination buffer, convert there
1325 cres = iconv(m2w,
1326 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1327 (char**)&bufPtr, &outbuf);
1328 res = n - (outbuf / SIZEOF_WCHAR_T);
1329
1330 if (ms_wcNeedsSwap)
1331 {
1332 // convert to native endianness
1333 WC_BSWAP(buf /* _not_ bufPtr */, res)
1334 }
1335
1336 // NB: iconv was given only strlen(psz) characters on input, and so
1337 // it couldn't convert the trailing zero. Let's do it ourselves
1338 // if there's some room left for it in the output buffer.
1339 if (res < n)
1340 buf[res] = 0;
1341 }
1342 else
1343 {
1344 // no destination buffer... convert using temp buffer
1345 // to calculate destination buffer requirement
1346 wchar_t tbuf[8];
1347 res = 0;
1348 do {
1349 bufPtr = tbuf;
1350 outbuf = 8*SIZEOF_WCHAR_T;
1351
1352 cres = iconv(m2w,
1353 ICONV_CHAR_CAST(&pszPtr), &inbuf,
1354 (char**)&bufPtr, &outbuf );
1355
1356 res += 8-(outbuf/SIZEOF_WCHAR_T);
1357 } while ((cres==(size_t)-1) && (errno==E2BIG));
1358 }
1359
1360 if (ICONV_FAILED(cres, inbuf))
1361 {
1362 //VS: it is ok if iconv fails, hence trace only
1363 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1364 return (size_t)-1;
1365 }
1366
1367 return res;
1368 }
1369
1370 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1371 {
1372 #if wxUSE_THREADS
1373 // NB: explained in MB2WC
1374 wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1375 #endif
1376
1377 size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1378 size_t outbuf = n;
1379 size_t res, cres;
1380
1381 wchar_t *tmpbuf = 0;
1382
1383 if (ms_wcNeedsSwap)
1384 {
1385 // need to copy to temp buffer to switch endianness
1386 // this absolutely doesn't rock!
1387 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1388 // could be in read-only memory, or be accessed in some other thread)
1389 tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1390 memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1391 WC_BSWAP(tmpbuf, inbuf)
1392 psz=tmpbuf;
1393 }
1394
1395 if (buf)
1396 {
1397 // have destination buffer, convert there
1398 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1399
1400 res = n-outbuf;
1401
1402 // NB: iconv was given only wcslen(psz) characters on input, and so
1403 // it couldn't convert the trailing zero. Let's do it ourselves
1404 // if there's some room left for it in the output buffer.
1405 if (res < n)
1406 buf[0] = 0;
1407 }
1408 else
1409 {
1410 // no destination buffer... convert using temp buffer
1411 // to calculate destination buffer requirement
1412 char tbuf[16];
1413 res = 0;
1414 do {
1415 buf = tbuf; outbuf = 16;
1416
1417 cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1418
1419 res += 16 - outbuf;
1420 } while ((cres==(size_t)-1) && (errno==E2BIG));
1421 }
1422
1423 if (ms_wcNeedsSwap)
1424 {
1425 free(tmpbuf);
1426 }
1427
1428 if (ICONV_FAILED(cres, inbuf))
1429 {
1430 //VS: it is ok if iconv fails, hence trace only
1431 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1432 return (size_t)-1;
1433 }
1434
1435 return res;
1436 }
1437
1438 #endif // HAVE_ICONV
1439
1440
1441 // ============================================================================
1442 // Win32 conversion classes
1443 // ============================================================================
1444
1445 #ifdef wxHAVE_WIN32_MB2WC
1446
1447 // from utils.cpp
1448 #if wxUSE_FONTMAP
1449 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1450 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1451 #endif
1452
1453 class wxMBConv_win32 : public wxMBConv
1454 {
1455 public:
1456 wxMBConv_win32()
1457 {
1458 m_CodePage = CP_ACP;
1459 }
1460
1461 #if wxUSE_FONTMAP
1462 wxMBConv_win32(const wxChar* name)
1463 {
1464 m_CodePage = wxCharsetToCodepage(name);
1465 }
1466
1467 wxMBConv_win32(wxFontEncoding encoding)
1468 {
1469 m_CodePage = wxEncodingToCodepage(encoding);
1470 }
1471 #endif
1472
1473 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1474 {
1475 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1476 // the behaviour is not compatible with the Unix version (using iconv)
1477 // and break the library itself, e.g. wxTextInputStream::NextChar()
1478 // wouldn't work if reading an incomplete MB char didn't result in an
1479 // error
1480 const size_t len = ::MultiByteToWideChar
1481 (
1482 m_CodePage, // code page
1483 MB_ERR_INVALID_CHARS, // flags: fall on error
1484 psz, // input string
1485 -1, // its length (NUL-terminated)
1486 buf, // output string
1487 buf ? n : 0 // size of output buffer
1488 );
1489
1490 // note that it returns count of written chars for buf != NULL and size
1491 // of the needed buffer for buf == NULL so in either case the length of
1492 // the string (which never includes the terminating NUL) is one less
1493 return len ? len - 1 : (size_t)-1;
1494 }
1495
1496 size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1497 {
1498 /*
1499 we have a problem here: by default, WideCharToMultiByte() may
1500 replace characters unrepresentable in the target code page with bad
1501 quality approximations such as turning "1/2" symbol (U+00BD) into
1502 "1" for the code pages which don't have it and we, obviously, want
1503 to avoid this at any price
1504
1505 the trouble is that this function does it _silently_, i.e. it won't
1506 even tell us whether it did or not... Win98/2000 and higher provide
1507 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1508 we have to resort to a round trip, i.e. check that converting back
1509 results in the same string -- this is, of course, expensive but
1510 otherwise we simply can't be sure to not garble the data.
1511 */
1512
1513 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1514 // it doesn't work with CJK encodings (which we test for rather roughly
1515 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1516 // supporting it
1517 BOOL usedDef wxDUMMY_INITIALIZE(false);
1518 BOOL *pUsedDef;
1519 int flags;
1520 if ( CanUseNoBestFit() && m_CodePage < 50000 )
1521 {
1522 // it's our lucky day
1523 flags = WC_NO_BEST_FIT_CHARS;
1524 pUsedDef = &usedDef;
1525 }
1526 else // old system or unsupported encoding
1527 {
1528 flags = 0;
1529 pUsedDef = NULL;
1530 }
1531
1532 const size_t len = ::WideCharToMultiByte
1533 (
1534 m_CodePage, // code page
1535 flags, // either none or no best fit
1536 pwz, // input string
1537 -1, // it is (wide) NUL-terminated
1538 buf, // output buffer
1539 buf ? n : 0, // and its size
1540 NULL, // default "replacement" char
1541 pUsedDef // [out] was it used?
1542 );
1543
1544 if ( !len )
1545 {
1546 // function totally failed
1547 return (size_t)-1;
1548 }
1549
1550 // if we were really converting, check if we succeeded
1551 if ( buf )
1552 {
1553 if ( flags )
1554 {
1555 // check if the conversion failed, i.e. if any replacements
1556 // were done
1557 if ( usedDef )
1558 return (size_t)-1;
1559 }
1560 else // we must resort to double tripping...
1561 {
1562 wxWCharBuffer wcBuf(n);
1563 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1564 wcscmp(wcBuf, pwz) != 0 )
1565 {
1566 // we didn't obtain the same thing we started from, hence
1567 // the conversion was lossy and we consider that it failed
1568 return (size_t)-1;
1569 }
1570 }
1571 }
1572
1573 // see the comment above for the reason of "len - 1"
1574 return len - 1;
1575 }
1576
1577 bool IsOk() const { return m_CodePage != -1; }
1578
1579 private:
1580 static bool CanUseNoBestFit()
1581 {
1582 static int s_isWin98Or2k = -1;
1583
1584 if ( s_isWin98Or2k == -1 )
1585 {
1586 int verMaj, verMin;
1587 switch ( wxGetOsVersion(&verMaj, &verMin) )
1588 {
1589 case wxWIN95:
1590 s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1591 break;
1592
1593 case wxWINDOWS_NT:
1594 s_isWin98Or2k = verMaj >= 5;
1595 break;
1596
1597 default:
1598 // unknown, be conseravtive by default
1599 s_isWin98Or2k = 0;
1600 }
1601
1602 wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1603 }
1604
1605 return s_isWin98Or2k == 1;
1606 }
1607
1608 long m_CodePage;
1609 };
1610
1611 #endif // wxHAVE_WIN32_MB2WC
1612
1613 // ============================================================================
1614 // Cocoa conversion classes
1615 // ============================================================================
1616
1617 #if defined(__WXCOCOA__)
1618
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1622
1623 #include <CoreFoundation/CFString.h>
1624 #include <CoreFoundation/CFStringEncodingExt.h>
1625
1626 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1627 {
1628 CFStringEncoding enc = kCFStringEncodingInvalidId ;
1629 if ( encoding == wxFONTENCODING_DEFAULT )
1630 {
1631 enc = CFStringGetSystemEncoding();
1632 }
1633 else switch( encoding)
1634 {
1635 case wxFONTENCODING_ISO8859_1 :
1636 enc = kCFStringEncodingISOLatin1 ;
1637 break ;
1638 case wxFONTENCODING_ISO8859_2 :
1639 enc = kCFStringEncodingISOLatin2;
1640 break ;
1641 case wxFONTENCODING_ISO8859_3 :
1642 enc = kCFStringEncodingISOLatin3 ;
1643 break ;
1644 case wxFONTENCODING_ISO8859_4 :
1645 enc = kCFStringEncodingISOLatin4;
1646 break ;
1647 case wxFONTENCODING_ISO8859_5 :
1648 enc = kCFStringEncodingISOLatinCyrillic;
1649 break ;
1650 case wxFONTENCODING_ISO8859_6 :
1651 enc = kCFStringEncodingISOLatinArabic;
1652 break ;
1653 case wxFONTENCODING_ISO8859_7 :
1654 enc = kCFStringEncodingISOLatinGreek;
1655 break ;
1656 case wxFONTENCODING_ISO8859_8 :
1657 enc = kCFStringEncodingISOLatinHebrew;
1658 break ;
1659 case wxFONTENCODING_ISO8859_9 :
1660 enc = kCFStringEncodingISOLatin5;
1661 break ;
1662 case wxFONTENCODING_ISO8859_10 :
1663 enc = kCFStringEncodingISOLatin6;
1664 break ;
1665 case wxFONTENCODING_ISO8859_11 :
1666 enc = kCFStringEncodingISOLatinThai;
1667 break ;
1668 case wxFONTENCODING_ISO8859_13 :
1669 enc = kCFStringEncodingISOLatin7;
1670 break ;
1671 case wxFONTENCODING_ISO8859_14 :
1672 enc = kCFStringEncodingISOLatin8;
1673 break ;
1674 case wxFONTENCODING_ISO8859_15 :
1675 enc = kCFStringEncodingISOLatin9;
1676 break ;
1677
1678 case wxFONTENCODING_KOI8 :
1679 enc = kCFStringEncodingKOI8_R;
1680 break ;
1681 case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1682 enc = kCFStringEncodingDOSRussian;
1683 break ;
1684
1685 // case wxFONTENCODING_BULGARIAN :
1686 // enc = ;
1687 // break ;
1688
1689 case wxFONTENCODING_CP437 :
1690 enc =kCFStringEncodingDOSLatinUS ;
1691 break ;
1692 case wxFONTENCODING_CP850 :
1693 enc = kCFStringEncodingDOSLatin1;
1694 break ;
1695 case wxFONTENCODING_CP852 :
1696 enc = kCFStringEncodingDOSLatin2;
1697 break ;
1698 case wxFONTENCODING_CP855 :
1699 enc = kCFStringEncodingDOSCyrillic;
1700 break ;
1701 case wxFONTENCODING_CP866 :
1702 enc =kCFStringEncodingDOSRussian ;
1703 break ;
1704 case wxFONTENCODING_CP874 :
1705 enc = kCFStringEncodingDOSThai;
1706 break ;
1707 case wxFONTENCODING_CP932 :
1708 enc = kCFStringEncodingDOSJapanese;
1709 break ;
1710 case wxFONTENCODING_CP936 :
1711 enc =kCFStringEncodingDOSChineseSimplif ;
1712 break ;
1713 case wxFONTENCODING_CP949 :
1714 enc = kCFStringEncodingDOSKorean;
1715 break ;
1716 case wxFONTENCODING_CP950 :
1717 enc = kCFStringEncodingDOSChineseTrad;
1718 break ;
1719 case wxFONTENCODING_CP1250 :
1720 enc = kCFStringEncodingWindowsLatin2;
1721 break ;
1722 case wxFONTENCODING_CP1251 :
1723 enc =kCFStringEncodingWindowsCyrillic ;
1724 break ;
1725 case wxFONTENCODING_CP1252 :
1726 enc =kCFStringEncodingWindowsLatin1 ;
1727 break ;
1728 case wxFONTENCODING_CP1253 :
1729 enc = kCFStringEncodingWindowsGreek;
1730 break ;
1731 case wxFONTENCODING_CP1254 :
1732 enc = kCFStringEncodingWindowsLatin5;
1733 break ;
1734 case wxFONTENCODING_CP1255 :
1735 enc =kCFStringEncodingWindowsHebrew ;
1736 break ;
1737 case wxFONTENCODING_CP1256 :
1738 enc =kCFStringEncodingWindowsArabic ;
1739 break ;
1740 case wxFONTENCODING_CP1257 :
1741 enc = kCFStringEncodingWindowsBalticRim;
1742 break ;
1743 // This only really encodes to UTF7 (if that) evidently
1744 // case wxFONTENCODING_UTF7 :
1745 // enc = kCFStringEncodingNonLossyASCII ;
1746 // break ;
1747 case wxFONTENCODING_UTF8 :
1748 enc = kCFStringEncodingUTF8 ;
1749 break ;
1750 case wxFONTENCODING_EUC_JP :
1751 enc = kCFStringEncodingEUC_JP;
1752 break ;
1753 case wxFONTENCODING_UTF16 :
1754 enc = kCFStringEncodingUnicode ;
1755 break ;
1756 case wxFONTENCODING_MACROMAN :
1757 enc = kCFStringEncodingMacRoman ;
1758 break ;
1759 case wxFONTENCODING_MACJAPANESE :
1760 enc = kCFStringEncodingMacJapanese ;
1761 break ;
1762 case wxFONTENCODING_MACCHINESETRAD :
1763 enc = kCFStringEncodingMacChineseTrad ;
1764 break ;
1765 case wxFONTENCODING_MACKOREAN :
1766 enc = kCFStringEncodingMacKorean ;
1767 break ;
1768 case wxFONTENCODING_MACARABIC :
1769 enc = kCFStringEncodingMacArabic ;
1770 break ;
1771 case wxFONTENCODING_MACHEBREW :
1772 enc = kCFStringEncodingMacHebrew ;
1773 break ;
1774 case wxFONTENCODING_MACGREEK :
1775 enc = kCFStringEncodingMacGreek ;
1776 break ;
1777 case wxFONTENCODING_MACCYRILLIC :
1778 enc = kCFStringEncodingMacCyrillic ;
1779 break ;
1780 case wxFONTENCODING_MACDEVANAGARI :
1781 enc = kCFStringEncodingMacDevanagari ;
1782 break ;
1783 case wxFONTENCODING_MACGURMUKHI :
1784 enc = kCFStringEncodingMacGurmukhi ;
1785 break ;
1786 case wxFONTENCODING_MACGUJARATI :
1787 enc = kCFStringEncodingMacGujarati ;
1788 break ;
1789 case wxFONTENCODING_MACORIYA :
1790 enc = kCFStringEncodingMacOriya ;
1791 break ;
1792 case wxFONTENCODING_MACBENGALI :
1793 enc = kCFStringEncodingMacBengali ;
1794 break ;
1795 case wxFONTENCODING_MACTAMIL :
1796 enc = kCFStringEncodingMacTamil ;
1797 break ;
1798 case wxFONTENCODING_MACTELUGU :
1799 enc = kCFStringEncodingMacTelugu ;
1800 break ;
1801 case wxFONTENCODING_MACKANNADA :
1802 enc = kCFStringEncodingMacKannada ;
1803 break ;
1804 case wxFONTENCODING_MACMALAJALAM :
1805 enc = kCFStringEncodingMacMalayalam ;
1806 break ;
1807 case wxFONTENCODING_MACSINHALESE :
1808 enc = kCFStringEncodingMacSinhalese ;
1809 break ;
1810 case wxFONTENCODING_MACBURMESE :
1811 enc = kCFStringEncodingMacBurmese ;
1812 break ;
1813 case wxFONTENCODING_MACKHMER :
1814 enc = kCFStringEncodingMacKhmer ;
1815 break ;
1816 case wxFONTENCODING_MACTHAI :
1817 enc = kCFStringEncodingMacThai ;
1818 break ;
1819 case wxFONTENCODING_MACLAOTIAN :
1820 enc = kCFStringEncodingMacLaotian ;
1821 break ;
1822 case wxFONTENCODING_MACGEORGIAN :
1823 enc = kCFStringEncodingMacGeorgian ;
1824 break ;
1825 case wxFONTENCODING_MACARMENIAN :
1826 enc = kCFStringEncodingMacArmenian ;
1827 break ;
1828 case wxFONTENCODING_MACCHINESESIMP :
1829 enc = kCFStringEncodingMacChineseSimp ;
1830 break ;
1831 case wxFONTENCODING_MACTIBETAN :
1832 enc = kCFStringEncodingMacTibetan ;
1833 break ;
1834 case wxFONTENCODING_MACMONGOLIAN :
1835 enc = kCFStringEncodingMacMongolian ;
1836 break ;
1837 case wxFONTENCODING_MACETHIOPIC :
1838 enc = kCFStringEncodingMacEthiopic ;
1839 break ;
1840 case wxFONTENCODING_MACCENTRALEUR :
1841 enc = kCFStringEncodingMacCentralEurRoman ;
1842 break ;
1843 case wxFONTENCODING_MACVIATNAMESE :
1844 enc = kCFStringEncodingMacVietnamese ;
1845 break ;
1846 case wxFONTENCODING_MACARABICEXT :
1847 enc = kCFStringEncodingMacExtArabic ;
1848 break ;
1849 case wxFONTENCODING_MACSYMBOL :
1850 enc = kCFStringEncodingMacSymbol ;
1851 break ;
1852 case wxFONTENCODING_MACDINGBATS :
1853 enc = kCFStringEncodingMacDingbats ;
1854 break ;
1855 case wxFONTENCODING_MACTURKISH :
1856 enc = kCFStringEncodingMacTurkish ;
1857 break ;
1858 case wxFONTENCODING_MACCROATIAN :
1859 enc = kCFStringEncodingMacCroatian ;
1860 break ;
1861 case wxFONTENCODING_MACICELANDIC :
1862 enc = kCFStringEncodingMacIcelandic ;
1863 break ;
1864 case wxFONTENCODING_MACROMANIAN :
1865 enc = kCFStringEncodingMacRomanian ;
1866 break ;
1867 case wxFONTENCODING_MACCELTIC :
1868 enc = kCFStringEncodingMacCeltic ;
1869 break ;
1870 case wxFONTENCODING_MACGAELIC :
1871 enc = kCFStringEncodingMacGaelic ;
1872 break ;
1873 // case wxFONTENCODING_MACKEYBOARD :
1874 // enc = kCFStringEncodingMacKeyboardGlyphs ;
1875 // break ;
1876 default :
1877 // because gcc is picky
1878 break ;
1879 } ;
1880 return enc ;
1881 }
1882
1883 class wxMBConv_cocoa : public wxMBConv
1884 {
1885 public:
1886 wxMBConv_cocoa()
1887 {
1888 Init(CFStringGetSystemEncoding()) ;
1889 }
1890
1891 wxMBConv_cocoa(const wxChar* name)
1892 {
1893 Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
1894 }
1895
1896 wxMBConv_cocoa(wxFontEncoding encoding)
1897 {
1898 Init( wxCFStringEncFromFontEnc(encoding) );
1899 }
1900
1901 ~wxMBConv_cocoa()
1902 {
1903 }
1904
1905 void Init( CFStringEncoding encoding)
1906 {
1907 m_encoding = encoding ;
1908 }
1909
1910 size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
1911 {
1912 wxASSERT(szUnConv);
1913
1914 CFStringRef theString = CFStringCreateWithBytes (
1915 NULL, //the allocator
1916 (const UInt8*)szUnConv,
1917 strlen(szUnConv),
1918 m_encoding,
1919 false //no BOM/external representation
1920 );
1921
1922 wxASSERT(theString);
1923
1924 size_t nOutLength = CFStringGetLength(theString);
1925
1926 if (szOut == NULL)
1927 {
1928 CFRelease(theString);
1929 return nOutLength;
1930 }
1931
1932 CFRange theRange = { 0, nOutSize };
1933
1934 #if SIZEOF_WCHAR_T == 4
1935 UniChar* szUniCharBuffer = new UniChar[nOutSize];
1936 #endif
1937
1938 CFStringGetCharacters(theString, theRange, szUniCharBuffer);
1939
1940 CFRelease(theString);
1941
1942 szUniCharBuffer[nOutLength] = '\0' ;
1943
1944 #if SIZEOF_WCHAR_T == 4
1945 wxMBConvUTF16 converter ;
1946 converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
1947 delete[] szUniCharBuffer;
1948 #endif
1949
1950 return nOutLength;
1951 }
1952
1953 size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
1954 {
1955 wxASSERT(szUnConv);
1956
1957 size_t nRealOutSize;
1958 size_t nBufSize = wxWcslen(szUnConv);
1959 UniChar* szUniBuffer = (UniChar*) szUnConv;
1960
1961 #if SIZEOF_WCHAR_T == 4
1962 wxMBConvUTF16BE converter ;
1963 nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
1964 szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
1965 converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
1966 nBufSize /= sizeof(UniChar);
1967 #endif
1968
1969 CFStringRef theString = CFStringCreateWithCharactersNoCopy(
1970 NULL, //allocator
1971 szUniBuffer,
1972 nBufSize,
1973 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
1974 );
1975
1976 wxASSERT(theString);
1977
1978 //Note that CER puts a BOM when converting to unicode
1979 //so we check and use getchars instead in that case
1980 if (m_encoding == kCFStringEncodingUnicode)
1981 {
1982 if (szOut != NULL)
1983 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
1984
1985 nRealOutSize = CFStringGetLength(theString) + 1;
1986 }
1987 else
1988 {
1989 CFStringGetBytes(
1990 theString,
1991 CFRangeMake(0, CFStringGetLength(theString)),
1992 m_encoding,
1993 0, //what to put in characters that can't be converted -
1994 //0 tells CFString to return NULL if it meets such a character
1995 false, //not an external representation
1996 (UInt8*) szOut,
1997 nOutSize,
1998 (CFIndex*) &nRealOutSize
1999 );
2000 }
2001
2002 CFRelease(theString);
2003
2004 #if SIZEOF_WCHAR_T == 4
2005 delete[] szUniBuffer;
2006 #endif
2007
2008 return nRealOutSize - 1;
2009 }
2010
2011 bool IsOk() const
2012 {
2013 return m_encoding != kCFStringEncodingInvalidId &&
2014 CFStringIsEncodingAvailable(m_encoding);
2015 }
2016
2017 private:
2018 CFStringEncoding m_encoding ;
2019 };
2020
2021 #endif // defined(__WXCOCOA__)
2022
2023 // ============================================================================
2024 // Mac conversion classes
2025 // ============================================================================
2026
2027 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2028
2029 class wxMBConv_mac : public wxMBConv
2030 {
2031 public:
2032 wxMBConv_mac()
2033 {
2034 Init(CFStringGetSystemEncoding()) ;
2035 }
2036
2037 wxMBConv_mac(const wxChar* name)
2038 {
2039 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
2040 }
2041
2042 wxMBConv_mac(wxFontEncoding encoding)
2043 {
2044 Init( wxMacGetSystemEncFromFontEnc(encoding) );
2045 }
2046
2047 ~wxMBConv_mac()
2048 {
2049 OSStatus status = noErr ;
2050 status = TECDisposeConverter(m_MB2WC_converter);
2051 status = TECDisposeConverter(m_WC2MB_converter);
2052 }
2053
2054
2055 void Init( TextEncodingBase encoding)
2056 {
2057 OSStatus status = noErr ;
2058 m_char_encoding = encoding ;
2059 m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2060
2061 status = TECCreateConverter(&m_MB2WC_converter,
2062 m_char_encoding,
2063 m_unicode_encoding);
2064 status = TECCreateConverter(&m_WC2MB_converter,
2065 m_unicode_encoding,
2066 m_char_encoding);
2067 }
2068
2069 size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2070 {
2071 OSStatus status = noErr ;
2072 ByteCount byteOutLen ;
2073 ByteCount byteInLen = strlen(psz) ;
2074 wchar_t *tbuf = NULL ;
2075 UniChar* ubuf = NULL ;
2076 size_t res = 0 ;
2077
2078 if (buf == NULL)
2079 {
2080 //apple specs say at least 32
2081 n = wxMax( 32 , byteInLen ) ;
2082 tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2083 }
2084 ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2085 #if SIZEOF_WCHAR_T == 4
2086 ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2087 #else
2088 ubuf = (UniChar*) (buf ? buf : tbuf) ;
2089 #endif
2090 status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2091 (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2092 #if SIZEOF_WCHAR_T == 4
2093 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2094 // is not properly terminated we get random characters at the end
2095 ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2096 wxMBConvUTF16BE converter ;
2097 res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2098 free( ubuf ) ;
2099 #else
2100 res = byteOutLen / sizeof( UniChar ) ;
2101 #endif
2102 if ( buf == NULL )
2103 free(tbuf) ;
2104
2105 if ( buf && res < n)
2106 buf[res] = 0;
2107
2108 return res ;
2109 }
2110
2111 size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2112 {
2113 OSStatus status = noErr ;
2114 ByteCount byteOutLen ;
2115 ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2116
2117 char *tbuf = NULL ;
2118
2119 if (buf == NULL)
2120 {
2121 //apple specs say at least 32
2122 n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2123 tbuf = (char*) malloc( n ) ;
2124 }
2125
2126 ByteCount byteBufferLen = n ;
2127 UniChar* ubuf = NULL ;
2128 #if SIZEOF_WCHAR_T == 4
2129 wxMBConvUTF16BE converter ;
2130 size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2131 byteInLen = unicharlen ;
2132 ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2133 converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2134 #else
2135 ubuf = (UniChar*) psz ;
2136 #endif
2137 status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2138 (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2139 #if SIZEOF_WCHAR_T == 4
2140 free( ubuf ) ;
2141 #endif
2142 if ( buf == NULL )
2143 free(tbuf) ;
2144
2145 size_t res = byteOutLen ;
2146 if ( buf && res < n)
2147 {
2148 buf[res] = 0;
2149
2150 //we need to double-trip to verify it didn't insert any ? in place
2151 //of bogus characters
2152 wxWCharBuffer wcBuf(n);
2153 size_t pszlen = wxWcslen(psz);
2154 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2155 wxWcslen(wcBuf) != pszlen ||
2156 memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2157 {
2158 // we didn't obtain the same thing we started from, hence
2159 // the conversion was lossy and we consider that it failed
2160 return (size_t)-1;
2161 }
2162 }
2163
2164 return res ;
2165 }
2166
2167 bool IsOk() const
2168 { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL ; }
2169
2170 private:
2171 TECObjectRef m_MB2WC_converter ;
2172 TECObjectRef m_WC2MB_converter ;
2173
2174 TextEncodingBase m_char_encoding ;
2175 TextEncodingBase m_unicode_encoding ;
2176 };
2177
2178 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2179
2180 // ============================================================================
2181 // wxEncodingConverter based conversion classes
2182 // ============================================================================
2183
2184 #if wxUSE_FONTMAP
2185
2186 class wxMBConv_wxwin : public wxMBConv
2187 {
2188 private:
2189 void Init()
2190 {
2191 m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2192 w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2193 }
2194
2195 public:
2196 // temporarily just use wxEncodingConverter stuff,
2197 // so that it works while a better implementation is built
2198 wxMBConv_wxwin(const wxChar* name)
2199 {
2200 if (name)
2201 m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
2202 else
2203 m_enc = wxFONTENCODING_SYSTEM;
2204
2205 Init();
2206 }
2207
2208 wxMBConv_wxwin(wxFontEncoding enc)
2209 {
2210 m_enc = enc;
2211
2212 Init();
2213 }
2214
2215 size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2216 {
2217 size_t inbuf = strlen(psz);
2218 if (buf)
2219 {
2220 if (!m2w.Convert(psz,buf))
2221 return (size_t)-1;
2222 }
2223 return inbuf;
2224 }
2225
2226 size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2227 {
2228 const size_t inbuf = wxWcslen(psz);
2229 if (buf)
2230 {
2231 if (!w2m.Convert(psz,buf))
2232 return (size_t)-1;
2233 }
2234
2235 return inbuf;
2236 }
2237
2238 bool IsOk() const { return m_ok; }
2239
2240 public:
2241 wxFontEncoding m_enc;
2242 wxEncodingConverter m2w, w2m;
2243
2244 // were we initialized successfully?
2245 bool m_ok;
2246
2247 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2248 };
2249
2250 #endif // wxUSE_FONTMAP
2251
2252 // ============================================================================
2253 // wxCSConv implementation
2254 // ============================================================================
2255
2256 void wxCSConv::Init()
2257 {
2258 m_name = NULL;
2259 m_convReal = NULL;
2260 m_deferred = true;
2261 }
2262
2263 wxCSConv::wxCSConv(const wxChar *charset)
2264 {
2265 Init();
2266
2267 if ( charset )
2268 {
2269 SetName(charset);
2270 }
2271
2272 m_encoding = wxFONTENCODING_SYSTEM;
2273 }
2274
2275 wxCSConv::wxCSConv(wxFontEncoding encoding)
2276 {
2277 if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2278 {
2279 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2280
2281 encoding = wxFONTENCODING_SYSTEM;
2282 }
2283
2284 Init();
2285
2286 m_encoding = encoding;
2287 }
2288
2289 wxCSConv::~wxCSConv()
2290 {
2291 Clear();
2292 }
2293
2294 wxCSConv::wxCSConv(const wxCSConv& conv)
2295 : wxMBConv()
2296 {
2297 Init();
2298
2299 SetName(conv.m_name);
2300 m_encoding = conv.m_encoding;
2301 }
2302
2303 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2304 {
2305 Clear();
2306
2307 SetName(conv.m_name);
2308 m_encoding = conv.m_encoding;
2309
2310 return *this;
2311 }
2312
2313 void wxCSConv::Clear()
2314 {
2315 free(m_name);
2316 delete m_convReal;
2317
2318 m_name = NULL;
2319 m_convReal = NULL;
2320 }
2321
2322 void wxCSConv::SetName(const wxChar *charset)
2323 {
2324 if (charset)
2325 {
2326 m_name = wxStrdup(charset);
2327 m_deferred = true;
2328 }
2329 }
2330
2331 wxMBConv *wxCSConv::DoCreate() const
2332 {
2333 // check for the special case of ASCII or ISO8859-1 charset: as we have
2334 // special knowledge of it anyhow, we don't need to create a special
2335 // conversion object
2336 if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2337 {
2338 // don't convert at all
2339 return NULL;
2340 }
2341
2342 // we trust OS to do conversion better than we can so try external
2343 // conversion methods first
2344 //
2345 // the full order is:
2346 // 1. OS conversion (iconv() under Unix or Win32 API)
2347 // 2. hard coded conversions for UTF
2348 // 3. wxEncodingConverter as fall back
2349
2350 // step (1)
2351 #ifdef HAVE_ICONV
2352 #if !wxUSE_FONTMAP
2353 if ( m_name )
2354 #endif // !wxUSE_FONTMAP
2355 {
2356 wxString name(m_name);
2357
2358 #if wxUSE_FONTMAP
2359 if ( name.empty() )
2360 name = wxFontMapper::Get()->GetEncodingName(m_encoding);
2361 #endif // wxUSE_FONTMAP
2362
2363 wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2364 if ( conv->IsOk() )
2365 return conv;
2366
2367 delete conv;
2368 }
2369 #endif // HAVE_ICONV
2370
2371 #ifdef wxHAVE_WIN32_MB2WC
2372 {
2373 #if wxUSE_FONTMAP
2374 wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2375 : new wxMBConv_win32(m_encoding);
2376 if ( conv->IsOk() )
2377 return conv;
2378
2379 delete conv;
2380 #else
2381 return NULL;
2382 #endif
2383 }
2384 #endif // wxHAVE_WIN32_MB2WC
2385 #if defined(__WXMAC__)
2386 {
2387 if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) )
2388 {
2389
2390 wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2391 : new wxMBConv_mac(m_encoding);
2392 if ( conv->IsOk() )
2393 return conv;
2394
2395 delete conv;
2396 }
2397 }
2398 #endif
2399 #if defined(__WXCOCOA__)
2400 {
2401 if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2402 {
2403
2404 wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2405 : new wxMBConv_cocoa(m_encoding);
2406 if ( conv->IsOk() )
2407 return conv;
2408
2409 delete conv;
2410 }
2411 }
2412 #endif
2413 // step (2)
2414 wxFontEncoding enc = m_encoding;
2415 #if wxUSE_FONTMAP
2416 if ( enc == wxFONTENCODING_SYSTEM && m_name )
2417 {
2418 // use "false" to suppress interactive dialogs -- we can be called from
2419 // anywhere and popping up a dialog from here is the last thing we want to
2420 // do
2421 enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
2422 }
2423 #endif // wxUSE_FONTMAP
2424
2425 switch ( enc )
2426 {
2427 case wxFONTENCODING_UTF7:
2428 return new wxMBConvUTF7;
2429
2430 case wxFONTENCODING_UTF8:
2431 return new wxMBConvUTF8;
2432
2433 case wxFONTENCODING_UTF16BE:
2434 return new wxMBConvUTF16BE;
2435
2436 case wxFONTENCODING_UTF16LE:
2437 return new wxMBConvUTF16LE;
2438
2439 case wxFONTENCODING_UTF32BE:
2440 return new wxMBConvUTF32BE;
2441
2442 case wxFONTENCODING_UTF32LE:
2443 return new wxMBConvUTF32LE;
2444
2445 default:
2446 // nothing to do but put here to suppress gcc warnings
2447 ;
2448 }
2449
2450 // step (3)
2451 #if wxUSE_FONTMAP
2452 {
2453 wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2454 : new wxMBConv_wxwin(m_encoding);
2455 if ( conv->IsOk() )
2456 return conv;
2457
2458 delete conv;
2459 }
2460 #endif // wxUSE_FONTMAP
2461
2462 // NB: This is a hack to prevent deadlock. What could otherwise happen
2463 // in Unicode build: wxConvLocal creation ends up being here
2464 // because of some failure and logs the error. But wxLog will try to
2465 // attach timestamp, for which it will need wxConvLocal (to convert
2466 // time to char* and then wchar_t*), but that fails, tries to log
2467 // error, but wxLog has a (already locked) critical section that
2468 // guards static buffer.
2469 static bool alreadyLoggingError = false;
2470 if (!alreadyLoggingError)
2471 {
2472 alreadyLoggingError = true;
2473 wxLogError(_("Cannot convert from the charset '%s'!"),
2474 m_name ? m_name
2475 :
2476 #if wxUSE_FONTMAP
2477 wxFontMapper::GetEncodingDescription(m_encoding).c_str()
2478 #else // !wxUSE_FONTMAP
2479 wxString::Format(_("encoding %s"), m_encoding).c_str()
2480 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2481 );
2482 alreadyLoggingError = false;
2483 }
2484
2485 return NULL;
2486 }
2487
2488 void wxCSConv::CreateConvIfNeeded() const
2489 {
2490 if ( m_deferred )
2491 {
2492 wxCSConv *self = (wxCSConv *)this; // const_cast
2493
2494 #if wxUSE_INTL
2495 // if we don't have neither the name nor the encoding, use the default
2496 // encoding for this system
2497 if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2498 {
2499 self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2500 }
2501 #endif // wxUSE_INTL
2502
2503 self->m_convReal = DoCreate();
2504 self->m_deferred = false;
2505 }
2506 }
2507
2508 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2509 {
2510 CreateConvIfNeeded();
2511
2512 if (m_convReal)
2513 return m_convReal->MB2WC(buf, psz, n);
2514
2515 // latin-1 (direct)
2516 size_t len = strlen(psz);
2517
2518 if (buf)
2519 {
2520 for (size_t c = 0; c <= len; c++)
2521 buf[c] = (unsigned char)(psz[c]);
2522 }
2523
2524 return len;
2525 }
2526
2527 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2528 {
2529 CreateConvIfNeeded();
2530
2531 if (m_convReal)
2532 return m_convReal->WC2MB(buf, psz, n);
2533
2534 // latin-1 (direct)
2535 const size_t len = wxWcslen(psz);
2536 if (buf)
2537 {
2538 for (size_t c = 0; c <= len; c++)
2539 {
2540 if (psz[c] > 0xFF)
2541 return (size_t)-1;
2542 buf[c] = (char)psz[c];
2543 }
2544 }
2545 else
2546 {
2547 for (size_t c = 0; c <= len; c++)
2548 {
2549 if (psz[c] > 0xFF)
2550 return (size_t)-1;
2551 }
2552 }
2553
2554 return len;
2555 }
2556
2557 // ----------------------------------------------------------------------------
2558 // globals
2559 // ----------------------------------------------------------------------------
2560
2561 #ifdef __WINDOWS__
2562 static wxMBConv_win32 wxConvLibcObj;
2563 #elif defined(__WXMAC__) && !defined(__MACH__)
2564 static wxMBConv_mac wxConvLibcObj ;
2565 #else
2566 static wxMBConvLibc wxConvLibcObj;
2567 #endif
2568
2569 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2570 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2571 static wxMBConvUTF7 wxConvUTF7Obj;
2572 static wxMBConvUTF8 wxConvUTF8Obj;
2573
2574
2575 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2576 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2577 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2578 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2579 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2580 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2581
2582 #else // !wxUSE_WCHAR_T
2583
2584 // stand-ins in absence of wchar_t
2585 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2586 wxConvISO8859_1,
2587 wxConvLocal,
2588 wxConvUTF8;
2589
2590 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2591
2592