1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
39 #include "wx/strconv.h"
44 #include "wx/msw/private.h"
48 #include "wx/msw/missing.h"
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
63 // ----------------------------------------------------------------------------
65 // ----------------------------------------------------------------------------
73 #include "wx/thread.h"
76 #include "wx/encconv.h"
77 #include "wx/fontmap.h"
81 #include <ATSUnicode.h>
82 #include <TextCommon.h>
83 #include <TextEncodingConverter.h>
85 #include "wx/mac/private.h" // includes mac headers
87 // ----------------------------------------------------------------------------
89 // ----------------------------------------------------------------------------
// Byte-swap, in place, the first `len` elements of the array `str`:
// BSWAP_UCS4 works on 32-bit units, BSWAP_UTF16 on 16-bit units.
//
// Both macros expand to a braced block rather than do { } while (0) because
// existing call sites invoke them without a trailing semicolon.  The
// arguments are parenthesized so that expressions such as `ptr + 1` or
// `a + b` expand correctly; note that they are still evaluated once per
// iteration, so they must not have side effects.
#define BSWAP_UCS4(str, len) \
    { unsigned _c; for (_c = 0; _c < (len); _c++) (str)[_c] = wxUINT32_SWAP_ALWAYS((str)[_c]); }
#define BSWAP_UTF16(str, len) \
    { unsigned _c; for (_c = 0; _c < (len); _c++) (str)[_c] = wxUINT16_SWAP_ALWAYS((str)[_c]); }
94 #if SIZEOF_WCHAR_T == 4
95 #define WC_NAME "UCS4"
96 #define WC_BSWAP BSWAP_UCS4
97 #ifdef WORDS_BIGENDIAN
98 #define WC_NAME_BEST "UCS-4BE"
100 #define WC_NAME_BEST "UCS-4LE"
102 #elif SIZEOF_WCHAR_T == 2
103 #define WC_NAME "UTF16"
104 #define WC_BSWAP BSWAP_UTF16
106 #ifdef WORDS_BIGENDIAN
107 #define WC_NAME_BEST "UTF-16BE"
109 #define WC_NAME_BEST "UTF-16LE"
111 #else // sizeof(wchar_t) != 2 nor 4
112 // does this ever happen?
113 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
116 // ============================================================================
118 // ============================================================================
120 // ----------------------------------------------------------------------------
121 // UTF-16 en/decoding to/from UCS-4
122 // ----------------------------------------------------------------------------
// Encode the code point `input` as UTF-16.
//
// Returns the number of 16-bit units the encoding takes (1 or 2), or
// (size_t)-1 if `input` is 0x110000 or above and so cannot be encoded.
// `output` may be NULL, in which case nothing is stored and only the unit
// count is computed; otherwise it must have room for the returned number of
// units.
static size_t encode_utf16(wxUint32 input, wxUint16 *output)
{
    // beyond the range reachable with a surrogate pair
    if ( input >= 0x110000 )
        return (size_t)-1;

    // fits in a single unit
    if ( input <= 0xffff )
    {
        if ( output )
            output[0] = (wxUint16) input;

        return 1;
    }

    // everything else becomes a high/low surrogate pair
    if ( output )
    {
        output[0] = (wxUint16) ((input >> 10) + 0xd7c0);
        output[1] = (wxUint16) ((input & 0x3ff) + 0xdc00);
    }

    return 2;
}
// Decode one code point from the UTF-16 sequence starting at `input`.
//
// Returns the number of 16-bit units consumed (1 or 2) and stores the code
// point in `output`.  For an ill-formed sequence, returns (size_t)-1 and
// stores the offending first unit in `output` so that callers that choose to
// skip it still have a value to work with.
//
// A sequence is ill-formed when it starts with a low surrogate
// (0xDC00..0xDFFF), or when a high surrogate (0xD800..0xDBFF) is not followed
// by a low surrogate.  The second unit is only read after the first one has
// been found to be a high surrogate, so a NUL-terminated input is never
// read past its terminator.
static size_t decode_utf16(const wxUint16* input, wxUint32& output)
{
    const wxUint16 lead = input[0];

    // not a surrogate at all: the unit is the code point
    if ( lead < 0xd800 || lead > 0xdfff )
    {
        output = lead;
        return 1;
    }

    // a low surrogate cannot start a pair, and the trail unit must lie in
    // 0xDC00..0xDFFF inclusive (0xDFFF itself is a valid low surrogate)
    if ( lead >= 0xdc00 || input[1] < 0xdc00 || input[1] > 0xdfff )
    {
        output = lead;
        return (size_t)-1;
    }

    // 0xd7c0 == 0xd800 - (0x10000 >> 10): folds the 0x10000 offset into the
    // high surrogate before shifting
    output = ((lead - 0xd7c0) << 10) + (input[1] - 0xdc00);
    return 2;
}
168 // ----------------------------------------------------------------------------
170 // ----------------------------------------------------------------------------
172 wxMBConv::~wxMBConv()
174 // nothing to do here (necessary for Darwin linking probably)
177 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
181 // calculate the length of the buffer needed first
182 size_t nLen
= MB2WC(NULL
, psz
, 0);
183 if ( nLen
!= (size_t)-1 )
185 // now do the actual conversion
186 wxWCharBuffer
buf(nLen
);
187 nLen
= MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NULL
188 if ( nLen
!= (size_t)-1 )
195 wxWCharBuffer
buf((wchar_t *)NULL
);
200 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
204 size_t nLen
= WC2MB(NULL
, pwz
, 0);
205 if ( nLen
!= (size_t)-1 )
207 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
208 nLen
= WC2MB(buf
.data(), pwz
, nLen
+ 4);
209 if ( nLen
!= (size_t)-1 )
216 wxCharBuffer
buf((char *)NULL
);
221 const wxWCharBuffer
wxMBConv::cMB2WC(const char *szString
, size_t nStringLen
, size_t* pOutSize
) const
223 wxASSERT(pOutSize
!= NULL
);
225 const char* szEnd
= szString
+ nStringLen
+ 1;
226 const char* szPos
= szString
;
227 const char* szStart
= szPos
;
229 size_t nActualLength
= 0;
230 size_t nCurrentSize
= nStringLen
; //try normal size first (should never resize?)
232 wxWCharBuffer
theBuffer(nCurrentSize
);
234 //Convert the string until the length() is reached, continuing the
235 //loop every time a null character is reached
236 while(szPos
!= szEnd
)
238 wxASSERT(szPos
< szEnd
); //something is _really_ screwed up if this rings true
240 //Get the length of the current (sub)string
241 size_t nLen
= MB2WC(NULL
, szPos
, 0);
243 //Invalid conversion?
244 if( nLen
== (size_t)-1 )
247 theBuffer
.data()[0u] = wxT('\0');
252 //Increase the actual length (+1 for current null character)
253 nActualLength
+= nLen
+ 1;
255 //if buffer too big, realloc the buffer
256 if (nActualLength
> (nCurrentSize
+1))
258 wxWCharBuffer
theNewBuffer(nCurrentSize
<< 1);
259 memcpy(theNewBuffer
.data(), theBuffer
.data(), nCurrentSize
* sizeof(wchar_t));
260 theBuffer
= theNewBuffer
;
264 //Convert the current (sub)string
265 if ( MB2WC(&theBuffer
.data()[szPos
- szStart
], szPos
, nLen
+ 1) == (size_t)-1 )
268 theBuffer
.data()[0u] = wxT('\0');
272 //Increment to next (sub)string
273 //Note that we have to use strlen here instead of nLen
274 //here because XX2XX gives us the size of the output buffer,
275 //not neccessarly the length of the string
276 szPos
+= strlen(szPos
) + 1;
279 //success - return actual length and the buffer
280 *pOutSize
= nActualLength
;
284 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *szString
, size_t nStringLen
, size_t* pOutSize
) const
286 wxASSERT(pOutSize
!= NULL
);
288 const wchar_t* szEnd
= szString
+ nStringLen
+ 1;
289 const wchar_t* szPos
= szString
;
290 const wchar_t* szStart
= szPos
;
292 size_t nActualLength
= 0;
293 size_t nCurrentSize
= nStringLen
<< 2; //try * 4 first
295 wxCharBuffer
theBuffer(nCurrentSize
);
297 //Convert the string until the length() is reached, continuing the
298 //loop every time a null character is reached
299 while(szPos
!= szEnd
)
301 wxASSERT(szPos
< szEnd
); //something is _really_ screwed up if this rings true
303 //Get the length of the current (sub)string
304 size_t nLen
= WC2MB(NULL
, szPos
, 0);
306 //Invalid conversion?
307 if( nLen
== (size_t)-1 )
310 theBuffer
.data()[0u] = wxT('\0');
314 //Increase the actual length (+1 for current null character)
315 nActualLength
+= nLen
+ 1;
317 //if buffer too big, realloc the buffer
318 if (nActualLength
> (nCurrentSize
+1))
320 wxCharBuffer
theNewBuffer(nCurrentSize
<< 1);
321 memcpy(theNewBuffer
.data(), theBuffer
.data(), nCurrentSize
);
322 theBuffer
= theNewBuffer
;
326 //Convert the current (sub)string
327 if(WC2MB(&theBuffer
.data()[szPos
- szStart
], szPos
, nLen
+ 1) == (size_t)-1 )
330 theBuffer
.data()[0u] = wxT('\0');
334 //Increment to next (sub)string
335 //Note that we have to use wxWcslen here instead of nLen
336 //here because XX2XX gives us the size of the output buffer,
337 //not neccessarly the length of the string
338 szPos
+= wxWcslen(szPos
) + 1;
341 //success - return actual length and the buffer
342 *pOutSize
= nActualLength
;
346 // ----------------------------------------------------------------------------
348 // ----------------------------------------------------------------------------
350 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
352 return wxMB2WC(buf
, psz
, n
);
355 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
357 return wxWC2MB(buf
, psz
, n
);
359 // ----------------------------------------------------------------------------
361 // ----------------------------------------------------------------------------
363 // Implementation (C) 2004 Fredrik Roubert
366 // BASE64 decoding table
368 static const unsigned char utf7unb64
[] =
370 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
371 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
372 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
373 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
374 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
375 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
376 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
377 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
378 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
379 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
380 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
381 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
382 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
383 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
384 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
385 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
386 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
387 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
388 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
389 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
390 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
391 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
392 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
393 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
394 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
395 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
396 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
397 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
398 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
399 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
400 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
401 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
404 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
408 while (*psz
&& ((!buf
) || (len
< n
)))
410 unsigned char cc
= *psz
++;
418 else if (*psz
== '-')
428 // BASE64 encoded string
432 for (lsb
= false, d
= 0, l
= 0;
433 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff; psz
++)
437 for (l
+= 6; l
>= 8; lsb
= !lsb
)
439 c
= (unsigned char)((d
>> (l
-= 8)) % 256);
448 *buf
= (wchar_t)(c
<< 8);
455 if (buf
&& (len
< n
))
461 // BASE64 encoding table
463 static const unsigned char utf7enb64
[] =
465 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
466 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
467 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
468 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
469 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
470 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
471 'w', 'x', 'y', 'z', '0', '1', '2', '3',
472 '4', '5', '6', '7', '8', '9', '+', '/'
476 // UTF-7 encoding table
478 // 0 - Set D (directly encoded characters)
479 // 1 - Set O (optional direct characters)
480 // 2 - whitespace characters (optional)
481 // 3 - special characters
483 static const unsigned char utf7encode
[128] =
485 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
486 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
487 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
489 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
491 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
495 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
501 while (*psz
&& ((!buf
) || (len
< n
)))
504 if (cc
< 0x80 && utf7encode
[cc
] < 1)
512 else if (((wxUint32
)cc
) > 0xffff)
514 // no surrogate pair generation (yet?)
525 // BASE64 encode string
526 unsigned int lsb
, d
, l
;
527 for (d
= 0, l
= 0;; psz
++)
529 for (lsb
= 0; lsb
< 2; lsb
++)
532 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
534 for (l
+= 8; l
>= 6; )
538 *buf
++ = utf7enb64
[(d
>> l
) % 64];
543 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
549 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
558 if (buf
&& (len
< n
))
563 // ----------------------------------------------------------------------------
565 // ----------------------------------------------------------------------------
// utf8_max[i] is the largest value compared against for index i by the UTF-8
// converters: the decoder uses it to reject encodings that are longer than
// necessary, the encoder scans it to find how many continuation bytes a
// value needs.  The final 0xffffffff entry guarantees that the encoder's
// scan always terminates.  The table is only ever read, hence const.
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
570 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
574 while (*psz
&& ((!buf
) || (len
< n
)))
576 unsigned char cc
= *psz
++, fc
= cc
;
578 for (cnt
= 0; fc
& 0x80; cnt
++)
592 // invalid UTF-8 sequence
597 unsigned ocnt
= cnt
- 1;
598 wxUint32 res
= cc
& (0x3f >> cnt
);
602 if ((cc
& 0xC0) != 0x80)
604 // invalid UTF-8 sequence
607 res
= (res
<< 6) | (cc
& 0x3f);
609 if (res
<= utf8_max
[ocnt
])
611 // illegal UTF-8 encoding
615 // cast is ok because wchar_t == wxUint16 if WC_UTF16
616 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
617 if (pa
== (size_t)-1)
626 #endif // WC_UTF16/!WC_UTF16
630 if (buf
&& (len
< n
))
635 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
639 while (*psz
&& ((!buf
) || (len
< n
)))
643 // cast is ok for WC_UTF16
644 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
645 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
647 cc
=(*psz
++) & 0x7fffffff;
650 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
664 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
666 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
671 if (buf
&& (len
<n
)) *buf
= 0;
679 // ----------------------------------------------------------------------------
681 // ----------------------------------------------------------------------------
683 #ifdef WORDS_BIGENDIAN
684 #define wxMBConvUTF16straight wxMBConvUTF16BE
685 #define wxMBConvUTF16swap wxMBConvUTF16LE
687 #define wxMBConvUTF16swap wxMBConvUTF16BE
688 #define wxMBConvUTF16straight wxMBConvUTF16LE
694 // copy 16bit MB to 16bit String
695 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
699 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
702 *buf
++ = *(wxUint16
*)psz
;
705 psz
+= sizeof(wxUint16
);
707 if (buf
&& len
<n
) *buf
=0;
713 // copy 16bit String to 16bit MB
// Copies the wide string psz into buf as 16-bit units without changing the
// byte order, advancing buf and len by sizeof(wxUint16) per input character.
// The loop runs until the terminating NUL of psz or, when buf is non-NULL,
// until len reaches n.
// NOTE(review): several lines of this function (declaration of len, any
// guard around the store, the return statement) are not visible here, so
// the notes below are limited to what the visible statements show.
714 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
// NOTE(review): the loop admits another 2-byte store whenever len < n, so an
// odd n appears to let the last store run one byte past the n bytes the
// caller offered -- confirm against the full source.
718 while (*psz
&& (!buf
|| len
< n
))
722 *(wxUint16
*)buf
= *psz
;
723 buf
+= sizeof(wxUint16
);
725 len
+= sizeof(wxUint16
);
// Writes a 16-bit terminator if it still fits.
// NOTE(review): n is a size_t, so for n < sizeof(wxUint16) the subtraction
// below wraps around to a huge value and the comparison succeeds, i.e. the
// terminator would be stored into a buffer that has no room for it.
// Comparing as len + sizeof(wxUint16) <= n would avoid the wrap-around.
728 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
734 // swap 16bit MB to 16bit String
735 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
739 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
743 ((char *)buf
)[0] = psz
[1];
744 ((char *)buf
)[1] = psz
[0];
748 psz
+= sizeof(wxUint16
);
750 if (buf
&& len
<n
) *buf
=0;
756 // swap 16bit String to 16bit MB
757 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
761 while (*psz
&& (!buf
|| len
< n
))
765 *buf
++ = ((char*)psz
)[1];
766 *buf
++ = ((char*)psz
)[0];
768 len
+= sizeof(wxUint16
);
771 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
780 // copy 16bit MB to 32bit String
781 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
785 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
788 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
789 if (pa
== (size_t)-1)
795 psz
+= pa
* sizeof(wxUint16
);
797 if (buf
&& len
<n
) *buf
=0;
803 // copy 32bit String to 16bit MB
804 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
808 while (*psz
&& (!buf
|| len
< n
))
811 size_t pa
=encode_utf16(*psz
, cc
);
813 if (pa
== (size_t)-1)
818 *(wxUint16
*)buf
= cc
[0];
819 buf
+= sizeof(wxUint16
);
822 *(wxUint16
*)buf
= cc
[1];
823 buf
+= sizeof(wxUint16
);
827 len
+= pa
*sizeof(wxUint16
);
830 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
836 // swap 16bit MB to 32bit String
837 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
841 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
845 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
846 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
848 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
849 if (pa
== (size_t)-1)
856 psz
+= pa
* sizeof(wxUint16
);
858 if (buf
&& len
<n
) *buf
=0;
864 // swap 32bit String to 16bit MB
865 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
869 while (*psz
&& (!buf
|| len
< n
))
872 size_t pa
=encode_utf16(*psz
, cc
);
874 if (pa
== (size_t)-1)
879 *buf
++ = ((char*)cc
)[1];
880 *buf
++ = ((char*)cc
)[0];
883 *buf
++ = ((char*)cc
)[3];
884 *buf
++ = ((char*)cc
)[2];
888 len
+= pa
*sizeof(wxUint16
);
891 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
899 // ----------------------------------------------------------------------------
901 // ----------------------------------------------------------------------------
903 #ifdef WORDS_BIGENDIAN
904 #define wxMBConvUTF32straight wxMBConvUTF32BE
905 #define wxMBConvUTF32swap wxMBConvUTF32LE
907 #define wxMBConvUTF32swap wxMBConvUTF32BE
908 #define wxMBConvUTF32straight wxMBConvUTF32LE
912 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
913 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
918 // copy 32bit MB to 16bit String
919 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
923 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
927 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
928 if (pa
== (size_t)-1)
938 psz
+= sizeof(wxUint32
);
940 if (buf
&& len
<n
) *buf
=0;
946 // copy 16bit String to 32bit MB
947 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
951 while (*psz
&& (!buf
|| len
< n
))
955 // cast is ok for WC_UTF16
956 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
957 if (pa
== (size_t)-1)
962 *(wxUint32
*)buf
= cc
;
963 buf
+= sizeof(wxUint32
);
965 len
+= sizeof(wxUint32
);
969 if (buf
&& len
<=n
-sizeof(wxUint32
))
977 // swap 32bit MB to 16bit String
978 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
982 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
985 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
986 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
991 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
992 if (pa
== (size_t)-1)
1002 psz
+= sizeof(wxUint32
);
1012 // swap 16bit String to 32bit MB
1013 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1017 while (*psz
&& (!buf
|| len
< n
))
1021 // cast is ok for WC_UTF16
1022 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
1023 if (pa
== (size_t)-1)
1033 len
+= sizeof(wxUint32
);
1037 if (buf
&& len
<=n
-sizeof(wxUint32
))
1046 // copy 32bit MB to 32bit String
1047 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1051 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1054 *buf
++ = *(wxUint32
*)psz
;
1056 psz
+= sizeof(wxUint32
);
1066 // copy 32bit String to 32bit MB
1067 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1071 while (*psz
&& (!buf
|| len
< n
))
1075 *(wxUint32
*)buf
= *psz
;
1076 buf
+= sizeof(wxUint32
);
1079 len
+= sizeof(wxUint32
);
1083 if (buf
&& len
<=n
-sizeof(wxUint32
))
1090 // swap 32bit MB to 32bit String
1091 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1095 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1099 ((char *)buf
)[0] = psz
[3];
1100 ((char *)buf
)[1] = psz
[2];
1101 ((char *)buf
)[2] = psz
[1];
1102 ((char *)buf
)[3] = psz
[0];
1106 psz
+= sizeof(wxUint32
);
1116 // swap 32bit String to 32bit MB
1117 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1121 while (*psz
&& (!buf
|| len
< n
))
1125 *buf
++ = ((char *)psz
)[3];
1126 *buf
++ = ((char *)psz
)[2];
1127 *buf
++ = ((char *)psz
)[1];
1128 *buf
++ = ((char *)psz
)[0];
1130 len
+= sizeof(wxUint32
);
1134 if (buf
&& len
<=n
-sizeof(wxUint32
))
1144 // ============================================================================
1145 // The classes doing conversion using the iconv_xxx() functions
1146 // ============================================================================
1150 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1151 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1152 // (unless there's yet another bug in glibc) the only case when iconv()
1153 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1154 // left in the input buffer -- when _real_ error occurs,
1155 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1157 // [This bug does not appear in glibc 2.2.]
1158 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1159 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1160 (errno != E2BIG || bufLeft != 0))
1162 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1165 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1167 // ----------------------------------------------------------------------------
1168 // wxMBConv_iconv: encapsulates an iconv character set
1169 // ----------------------------------------------------------------------------
1171 class wxMBConv_iconv
: public wxMBConv
1174 wxMBConv_iconv(const wxChar
*name
);
1175 virtual ~wxMBConv_iconv();
1177 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1178 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1181 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
1184 // the iconv handlers used to translate from multibyte to wide char and in
1185 // the other direction
1189 // guards access to m2w and w2m objects
1190 wxMutex m_iconvMutex
;
1194 // the name (for iconv_open()) of a wide char charset -- if none is
1195 // available on this machine, it will remain NULL
1196 static const char *ms_wcCharsetName
;
1198 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1199 // different endian-ness than the native one
1200 static bool ms_wcNeedsSwap
;
1203 const char *wxMBConv_iconv::ms_wcCharsetName
= NULL
;
1204 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1206 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1208 // Do it the hard way
1210 for (size_t i
= 0; i
< wxStrlen(name
)+1; i
++)
1211 cname
[i
] = (char) name
[i
];
1213 // check for charset that represents wchar_t:
1214 if (ms_wcCharsetName
== NULL
)
1216 ms_wcNeedsSwap
= false;
1218 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1219 ms_wcCharsetName
= WC_NAME_BEST
;
1220 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1222 if (m2w
== (iconv_t
)-1)
1224 // try charset w/o bytesex info (e.g. "UCS4")
1225 // and check for bytesex ourselves:
1226 ms_wcCharsetName
= WC_NAME
;
1227 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1229 // last bet, try if it knows WCHAR_T pseudo-charset
1230 if (m2w
== (iconv_t
)-1)
1232 ms_wcCharsetName
= "WCHAR_T";
1233 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1236 if (m2w
!= (iconv_t
)-1)
1238 char buf
[2], *bufPtr
;
1239 wchar_t wbuf
[2], *wbufPtr
;
1247 outsz
= SIZEOF_WCHAR_T
* 2;
1251 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1252 (char**)&wbufPtr
, &outsz
);
1254 if (ICONV_FAILED(res
, insz
))
1256 ms_wcCharsetName
= NULL
;
1257 wxLogLastError(wxT("iconv"));
1258 wxLogError(_("Conversion to charset '%s' doesn't work."), name
);
1262 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1267 ms_wcCharsetName
= NULL
;
1269 // VS: we must not output an error here, since wxWidgets will safely
1270 // fall back to using wxEncodingConverter.
1271 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name
);
1275 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName
, ms_wcNeedsSwap
);
1277 else // we already have ms_wcCharsetName
1279 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1282 // NB: don't ever pass NULL to iconv_open(), it may crash!
1283 if ( ms_wcCharsetName
)
1285 w2m
= iconv_open( cname
, ms_wcCharsetName
);
1293 wxMBConv_iconv::~wxMBConv_iconv()
1295 if ( m2w
!= (iconv_t
)-1 )
1297 if ( w2m
!= (iconv_t
)-1 )
1301 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1304 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1305 // Unfortunately there is a couple of global wxCSConv objects such as
1306 // wxConvLocal that are used all over wx code, so we have to make sure
1307 // the handle is used by at most one thread at the time. Otherwise
1308 // only a few wx classes would be safe to use from non-main threads
1309 // as MB<->WC conversion would fail "randomly".
1310 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1313 size_t inbuf
= strlen(psz
);
1314 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1316 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1317 wchar_t *bufPtr
= buf
;
1318 const char *pszPtr
= psz
;
1322 // have destination buffer, convert there
1324 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1325 (char**)&bufPtr
, &outbuf
);
1326 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1330 // convert to native endianness
1331 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
1334 // NB: iconv was given only strlen(psz) characters on input, and so
1335 // it couldn't convert the trailing zero. Let's do it ourselves
1336 // if there's some room left for it in the output buffer.
1342 // no destination buffer... convert using temp buffer
1343 // to calculate destination buffer requirement
1348 outbuf
= 8*SIZEOF_WCHAR_T
;
1351 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1352 (char**)&bufPtr
, &outbuf
);
1354 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1355 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1358 if (ICONV_FAILED(cres
, inbuf
))
1360 //VS: it is ok if iconv fails, hence trace only
1361 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
// Converts the wide string psz to the multibyte charset through the w2m
// iconv handle, holding m_iconvMutex for the duration of the call.
// With a destination buffer the text is converted straight into it; without
// one, the text is pushed through a small temporary buffer repeatedly (for
// as long as iconv reports E2BIG) in order to work out how much space the
// result needs.  An iconv failure is only reported with wxLogTrace.
// NOTE(review): a number of lines of this function (the conditions selecting
// each branch, the result bookkeeping, the release of tmpbuf and the return
// statements) are not visible here; the notes below are limited to what the
// visible statements show.
1368 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1371 // NB: explained in MB2WC
1372 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
// inbuf is the input length in BYTES (character count times the wchar_t
// size), which is the unit iconv() expects
1375 size_t inbuf
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1379 wchar_t *tmpbuf
= 0;
1383 // need to copy to temp buffer to switch endianness
1384 // this absolutely doesn't rock!
1385 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1386 // could be in read-only memory, or be accessed in some other thread)
// NOTE(review): inbuf is already a byte count, yet it is multiplied by
// SIZEOF_WCHAR_T again below.  The allocation is merely oversized, but the
// memcpy then reads (inbuf+1)*SIZEOF_WCHAR_T bytes from psz, which only
// holds inbuf + SIZEOF_WCHAR_T bytes including its terminator -- an
// out-of-bounds read for any non-empty string.  WC_BSWAP is likewise given
// the byte count where it takes an element count, so it swaps elements that
// are not part of the string.  The malloc result is not checked in the
// visible lines.  Presumably the intent was: allocate and copy
// inbuf + SIZEOF_WCHAR_T bytes, swap inbuf / SIZEOF_WCHAR_T elements.
1387 tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
1388 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
1389 WC_BSWAP(tmpbuf
, inbuf
)
1395 // have destination buffer, convert there
1396 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1400 // NB: iconv was given only wcslen(psz) characters on input, and so
1401 // it couldn't convert the trailing zero. Let's do it ourselves
1402 // if there's some room left for it in the output buffer.
1408 // no destination buffer... convert using temp buffer
1409 // to calculate destination buffer requirement
// each pass converts into the same 16-byte scratch area; only the amount
// produced matters here, the converted bytes themselves are discarded
1413 buf
= tbuf
; outbuf
= 16;
1415 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1418 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1426 if (ICONV_FAILED(cres
, inbuf
))
1428 //VS: it is ok if iconv fails, hence trace only
1429 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1436 #endif // HAVE_ICONV
1439 // ============================================================================
1440 // Win32 conversion classes
1441 // ============================================================================
1443 #ifdef wxHAVE_WIN32_MB2WC
1447 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1448 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1451 class wxMBConv_win32
: public wxMBConv
1456 m_CodePage
= CP_ACP
;
1460 wxMBConv_win32(const wxChar
* name
)
1462 m_CodePage
= wxCharsetToCodepage(name
);
1465 wxMBConv_win32(wxFontEncoding encoding
)
1467 m_CodePage
= wxEncodingToCodepage(encoding
);
1471 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1473 // note that we have to use the MB_ERR_INVALID_CHARS flag as without it
1474 // the behaviour is not compatible with the Unix version (using iconv)
1475 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
1476 // wouldn't work if reading an incomplete MB char didn't result in an
1479 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1480 // an error (tested under Windows Server 2003) and apparently it is
1481 // done on purpose, i.e. the function accepts any input in this case
1482 // and although I'd prefer to return error on ill-formed output, our
1483 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1484 // explicitly ill-formed according to RFC 2152) neither so we don't
1485 // even have any fallback here...
1486 int flags
= m_CodePage
== CP_UTF7
? 0 : MB_ERR_INVALID_CHARS
;
1488 const size_t len
= ::MultiByteToWideChar
1490 m_CodePage
, // code page
1491 flags
, // flags: fall on error
1492 psz
, // input string
1493 -1, // its length (NUL-terminated)
1494 buf
, // output string
1495 buf
? n
: 0 // size of output buffer
1498 // note that it returns count of written chars for buf != NULL and size
1499 // of the needed buffer for buf == NULL so in either case the length of
1500 // the string (which never includes the terminating NUL) is one less
1501 return len
? len
- 1 : (size_t)-1;
1504 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1507 we have a problem here: by default, WideCharToMultiByte() may
1508 replace characters unrepresentable in the target code page with bad
1509 quality approximations such as turning "1/2" symbol (U+00BD) into
1510 "1" for the code pages which don't have it and we, obviously, want
1511 to avoid this at any price
1513 the trouble is that this function does it _silently_, i.e. it won't
1514 even tell us whether it did or not... Win98/2000 and higher provide
1515 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1516 we have to resort to a round trip, i.e. check that converting back
1517 results in the same string -- this is, of course, expensive but
1518 otherwise we simply can't be sure to not garble the data.
1521 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1522 // it doesn't work with CJK encodings (which we test for rather roughly
1523 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1525 BOOL usedDef
wxDUMMY_INITIALIZE(false);
1528 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1530 // it's our lucky day
1531 flags
= WC_NO_BEST_FIT_CHARS
;
1532 pUsedDef
= &usedDef
;
1534 else // old system or unsupported encoding
1540 const size_t len
= ::WideCharToMultiByte
1542 m_CodePage
, // code page
1543 flags
, // either none or no best fit
1544 pwz
, // input string
1545 -1, // it is (wide) NUL-terminated
1546 buf
, // output buffer
1547 buf
? n
: 0, // and its size
1548 NULL
, // default "replacement" char
1549 pUsedDef
// [out] was it used?
1554 // function totally failed
1558 // if we were really converting, check if we succeeded
1563 // check if the conversion failed, i.e. if any replacements
1568 else // we must resort to double tripping...
1570 wxWCharBuffer
wcBuf(n
);
1571 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1572 wcscmp(wcBuf
, pwz
) != 0 )
1574 // we didn't obtain the same thing we started from, hence
1575 // the conversion was lossy and we consider that it failed
1581 // see the comment above for the reason of "len - 1"
1585 bool IsOk() const { return m_CodePage
!= -1; }
1588 static bool CanUseNoBestFit()
1590 static int s_isWin98Or2k
= -1;
1592 if ( s_isWin98Or2k
== -1 )
1595 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
1598 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
1602 s_isWin98Or2k
= verMaj
>= 5;
1606 // unknown, be conseravtive by default
1610 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1613 return s_isWin98Or2k
== 1;
1619 #endif // wxHAVE_WIN32_MB2WC
1621 // ============================================================================
1622 // Cocoa conversion classes
1623 // ============================================================================
1625 #if defined(__WXCOCOA__)
// RN: There is no UTF-32 support in either Core Foundation or
// Cocoa. Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
1631 #include <CoreFoundation/CFString.h>
1632 #include <CoreFoundation/CFStringEncodingExt.h>
1634 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
1636 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
1637 if ( encoding
== wxFONTENCODING_DEFAULT
)
1639 enc
= CFStringGetSystemEncoding();
1641 else switch( encoding
)
1643 case wxFONTENCODING_ISO8859_1
:
1644 enc
= kCFStringEncodingISOLatin1
;
1646 case wxFONTENCODING_ISO8859_2
:
1647 enc
= kCFStringEncodingISOLatin2
;
1649 case wxFONTENCODING_ISO8859_3
:
1650 enc
= kCFStringEncodingISOLatin3
;
1652 case wxFONTENCODING_ISO8859_4
:
1653 enc
= kCFStringEncodingISOLatin4
;
1655 case wxFONTENCODING_ISO8859_5
:
1656 enc
= kCFStringEncodingISOLatinCyrillic
;
1658 case wxFONTENCODING_ISO8859_6
:
1659 enc
= kCFStringEncodingISOLatinArabic
;
1661 case wxFONTENCODING_ISO8859_7
:
1662 enc
= kCFStringEncodingISOLatinGreek
;
1664 case wxFONTENCODING_ISO8859_8
:
1665 enc
= kCFStringEncodingISOLatinHebrew
;
1667 case wxFONTENCODING_ISO8859_9
:
1668 enc
= kCFStringEncodingISOLatin5
;
1670 case wxFONTENCODING_ISO8859_10
:
1671 enc
= kCFStringEncodingISOLatin6
;
1673 case wxFONTENCODING_ISO8859_11
:
1674 enc
= kCFStringEncodingISOLatinThai
;
1676 case wxFONTENCODING_ISO8859_13
:
1677 enc
= kCFStringEncodingISOLatin7
;
1679 case wxFONTENCODING_ISO8859_14
:
1680 enc
= kCFStringEncodingISOLatin8
;
1682 case wxFONTENCODING_ISO8859_15
:
1683 enc
= kCFStringEncodingISOLatin9
;
1686 case wxFONTENCODING_KOI8
:
1687 enc
= kCFStringEncodingKOI8_R
;
1689 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
1690 enc
= kCFStringEncodingDOSRussian
;
1693 // case wxFONTENCODING_BULGARIAN :
1697 case wxFONTENCODING_CP437
:
1698 enc
=kCFStringEncodingDOSLatinUS
;
1700 case wxFONTENCODING_CP850
:
1701 enc
= kCFStringEncodingDOSLatin1
;
1703 case wxFONTENCODING_CP852
:
1704 enc
= kCFStringEncodingDOSLatin2
;
1706 case wxFONTENCODING_CP855
:
1707 enc
= kCFStringEncodingDOSCyrillic
;
1709 case wxFONTENCODING_CP866
:
1710 enc
=kCFStringEncodingDOSRussian
;
1712 case wxFONTENCODING_CP874
:
1713 enc
= kCFStringEncodingDOSThai
;
1715 case wxFONTENCODING_CP932
:
1716 enc
= kCFStringEncodingDOSJapanese
;
1718 case wxFONTENCODING_CP936
:
1719 enc
=kCFStringEncodingDOSChineseSimplif
;
1721 case wxFONTENCODING_CP949
:
1722 enc
= kCFStringEncodingDOSKorean
;
1724 case wxFONTENCODING_CP950
:
1725 enc
= kCFStringEncodingDOSChineseTrad
;
1727 case wxFONTENCODING_CP1250
:
1728 enc
= kCFStringEncodingWindowsLatin2
;
1730 case wxFONTENCODING_CP1251
:
1731 enc
=kCFStringEncodingWindowsCyrillic
;
1733 case wxFONTENCODING_CP1252
:
1734 enc
=kCFStringEncodingWindowsLatin1
;
1736 case wxFONTENCODING_CP1253
:
1737 enc
= kCFStringEncodingWindowsGreek
;
1739 case wxFONTENCODING_CP1254
:
1740 enc
= kCFStringEncodingWindowsLatin5
;
1742 case wxFONTENCODING_CP1255
:
1743 enc
=kCFStringEncodingWindowsHebrew
;
1745 case wxFONTENCODING_CP1256
:
1746 enc
=kCFStringEncodingWindowsArabic
;
1748 case wxFONTENCODING_CP1257
:
1749 enc
= kCFStringEncodingWindowsBalticRim
;
1751 // This only really encodes to UTF7 (if that) evidently
1752 // case wxFONTENCODING_UTF7 :
1753 // enc = kCFStringEncodingNonLossyASCII ;
1755 case wxFONTENCODING_UTF8
:
1756 enc
= kCFStringEncodingUTF8
;
1758 case wxFONTENCODING_EUC_JP
:
1759 enc
= kCFStringEncodingEUC_JP
;
1761 case wxFONTENCODING_UTF16
:
1762 enc
= kCFStringEncodingUnicode
;
1764 case wxFONTENCODING_MACROMAN
:
1765 enc
= kCFStringEncodingMacRoman
;
1767 case wxFONTENCODING_MACJAPANESE
:
1768 enc
= kCFStringEncodingMacJapanese
;
1770 case wxFONTENCODING_MACCHINESETRAD
:
1771 enc
= kCFStringEncodingMacChineseTrad
;
1773 case wxFONTENCODING_MACKOREAN
:
1774 enc
= kCFStringEncodingMacKorean
;
1776 case wxFONTENCODING_MACARABIC
:
1777 enc
= kCFStringEncodingMacArabic
;
1779 case wxFONTENCODING_MACHEBREW
:
1780 enc
= kCFStringEncodingMacHebrew
;
1782 case wxFONTENCODING_MACGREEK
:
1783 enc
= kCFStringEncodingMacGreek
;
1785 case wxFONTENCODING_MACCYRILLIC
:
1786 enc
= kCFStringEncodingMacCyrillic
;
1788 case wxFONTENCODING_MACDEVANAGARI
:
1789 enc
= kCFStringEncodingMacDevanagari
;
1791 case wxFONTENCODING_MACGURMUKHI
:
1792 enc
= kCFStringEncodingMacGurmukhi
;
1794 case wxFONTENCODING_MACGUJARATI
:
1795 enc
= kCFStringEncodingMacGujarati
;
1797 case wxFONTENCODING_MACORIYA
:
1798 enc
= kCFStringEncodingMacOriya
;
1800 case wxFONTENCODING_MACBENGALI
:
1801 enc
= kCFStringEncodingMacBengali
;
1803 case wxFONTENCODING_MACTAMIL
:
1804 enc
= kCFStringEncodingMacTamil
;
1806 case wxFONTENCODING_MACTELUGU
:
1807 enc
= kCFStringEncodingMacTelugu
;
1809 case wxFONTENCODING_MACKANNADA
:
1810 enc
= kCFStringEncodingMacKannada
;
1812 case wxFONTENCODING_MACMALAJALAM
:
1813 enc
= kCFStringEncodingMacMalayalam
;
1815 case wxFONTENCODING_MACSINHALESE
:
1816 enc
= kCFStringEncodingMacSinhalese
;
1818 case wxFONTENCODING_MACBURMESE
:
1819 enc
= kCFStringEncodingMacBurmese
;
1821 case wxFONTENCODING_MACKHMER
:
1822 enc
= kCFStringEncodingMacKhmer
;
1824 case wxFONTENCODING_MACTHAI
:
1825 enc
= kCFStringEncodingMacThai
;
1827 case wxFONTENCODING_MACLAOTIAN
:
1828 enc
= kCFStringEncodingMacLaotian
;
1830 case wxFONTENCODING_MACGEORGIAN
:
1831 enc
= kCFStringEncodingMacGeorgian
;
1833 case wxFONTENCODING_MACARMENIAN
:
1834 enc
= kCFStringEncodingMacArmenian
;
1836 case wxFONTENCODING_MACCHINESESIMP
:
1837 enc
= kCFStringEncodingMacChineseSimp
;
1839 case wxFONTENCODING_MACTIBETAN
:
1840 enc
= kCFStringEncodingMacTibetan
;
1842 case wxFONTENCODING_MACMONGOLIAN
:
1843 enc
= kCFStringEncodingMacMongolian
;
1845 case wxFONTENCODING_MACETHIOPIC
:
1846 enc
= kCFStringEncodingMacEthiopic
;
1848 case wxFONTENCODING_MACCENTRALEUR
:
1849 enc
= kCFStringEncodingMacCentralEurRoman
;
1851 case wxFONTENCODING_MACVIATNAMESE
:
1852 enc
= kCFStringEncodingMacVietnamese
;
1854 case wxFONTENCODING_MACARABICEXT
:
1855 enc
= kCFStringEncodingMacExtArabic
;
1857 case wxFONTENCODING_MACSYMBOL
:
1858 enc
= kCFStringEncodingMacSymbol
;
1860 case wxFONTENCODING_MACDINGBATS
:
1861 enc
= kCFStringEncodingMacDingbats
;
1863 case wxFONTENCODING_MACTURKISH
:
1864 enc
= kCFStringEncodingMacTurkish
;
1866 case wxFONTENCODING_MACCROATIAN
:
1867 enc
= kCFStringEncodingMacCroatian
;
1869 case wxFONTENCODING_MACICELANDIC
:
1870 enc
= kCFStringEncodingMacIcelandic
;
1872 case wxFONTENCODING_MACROMANIAN
:
1873 enc
= kCFStringEncodingMacRomanian
;
1875 case wxFONTENCODING_MACCELTIC
:
1876 enc
= kCFStringEncodingMacCeltic
;
1878 case wxFONTENCODING_MACGAELIC
:
1879 enc
= kCFStringEncodingMacGaelic
;
1881 // case wxFONTENCODING_MACKEYBOARD :
1882 // enc = kCFStringEncodingMacKeyboardGlyphs ;
1885 // because gcc is picky
1891 class wxMBConv_cocoa
: public wxMBConv
1896 Init(CFStringGetSystemEncoding()) ;
1900 wxMBConv_cocoa(const wxChar
* name
)
1902 Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, false) ) ) ;
1906 wxMBConv_cocoa(wxFontEncoding encoding
)
1908 Init( wxCFStringEncFromFontEnc(encoding
) );
1915 void Init( CFStringEncoding encoding
)
1917 m_encoding
= encoding
;
1920 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
1924 CFStringRef theString
= CFStringCreateWithBytes (
1925 NULL
, //the allocator
1926 (const UInt8
*)szUnConv
,
1929 false //no BOM/external representation
1932 wxASSERT(theString
);
1934 size_t nOutLength
= CFStringGetLength(theString
);
1938 CFRelease(theString
);
1942 CFRange theRange
= { 0, nOutSize
};
1944 #if SIZEOF_WCHAR_T == 4
1945 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
1948 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
1950 CFRelease(theString
);
1952 szUniCharBuffer
[nOutLength
] = '\0' ;
1954 #if SIZEOF_WCHAR_T == 4
1955 wxMBConvUTF16 converter
;
1956 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
1957 delete[] szUniCharBuffer
;
1963 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
1967 size_t nRealOutSize
;
1968 size_t nBufSize
= wxWcslen(szUnConv
);
1969 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
1971 #if SIZEOF_WCHAR_T == 4
1972 wxMBConvUTF16BE converter
;
1973 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
1974 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
1975 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
1976 nBufSize
/= sizeof(UniChar
);
1979 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
1983 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
1986 wxASSERT(theString
);
1988 //Note that CER puts a BOM when converting to unicode
1989 //so we check and use getchars instead in that case
1990 if (m_encoding
== kCFStringEncodingUnicode
)
1993 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
1995 nRealOutSize
= CFStringGetLength(theString
) + 1;
2001 CFRangeMake(0, CFStringGetLength(theString
)),
2003 0, //what to put in characters that can't be converted -
2004 //0 tells CFString to return NULL if it meets such a character
2005 false, //not an external representation
2008 (CFIndex
*) &nRealOutSize
2012 CFRelease(theString
);
2014 #if SIZEOF_WCHAR_T == 4
2015 delete[] szUniBuffer
;
2018 return nRealOutSize
- 1;
2023 return m_encoding
!= kCFStringEncodingInvalidId
&&
2024 CFStringIsEncodingAvailable(m_encoding
);
2028 CFStringEncoding m_encoding
;
2031 #endif // defined(__WXCOCOA__)
2033 // ============================================================================
2034 // Mac conversion classes
2035 // ============================================================================
2037 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2039 class wxMBConv_mac
: public wxMBConv
2044 Init(CFStringGetSystemEncoding()) ;
2048 wxMBConv_mac(const wxChar
* name
)
2050 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, false) ) ) ;
2054 wxMBConv_mac(wxFontEncoding encoding
)
2056 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2061 OSStatus status
= noErr
;
2062 status
= TECDisposeConverter(m_MB2WC_converter
);
2063 status
= TECDisposeConverter(m_WC2MB_converter
);
2067 void Init( TextEncodingBase encoding
)
2069 OSStatus status
= noErr
;
2070 m_char_encoding
= encoding
;
2071 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2073 status
= TECCreateConverter(&m_MB2WC_converter
,
2075 m_unicode_encoding
);
2076 status
= TECCreateConverter(&m_WC2MB_converter
,
2081 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2083 OSStatus status
= noErr
;
2084 ByteCount byteOutLen
;
2085 ByteCount byteInLen
= strlen(psz
) ;
2086 wchar_t *tbuf
= NULL
;
2087 UniChar
* ubuf
= NULL
;
2092 //apple specs say at least 32
2093 n
= wxMax( 32 , byteInLen
) ;
2094 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2096 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2097 #if SIZEOF_WCHAR_T == 4
2098 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2100 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2102 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2103 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2104 #if SIZEOF_WCHAR_T == 4
2105 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2106 // is not properly terminated we get random characters at the end
2107 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2108 wxMBConvUTF16BE converter
;
2109 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
2112 res
= byteOutLen
/ sizeof( UniChar
) ;
2117 if ( buf
&& res
< n
)
2123 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2125 OSStatus status
= noErr
;
2126 ByteCount byteOutLen
;
2127 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2133 //apple specs say at least 32
2134 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2135 tbuf
= (char*) malloc( n
) ;
2138 ByteCount byteBufferLen
= n
;
2139 UniChar
* ubuf
= NULL
;
2140 #if SIZEOF_WCHAR_T == 4
2141 wxMBConvUTF16BE converter
;
2142 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2143 byteInLen
= unicharlen
;
2144 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2145 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2147 ubuf
= (UniChar
*) psz
;
2149 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2150 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2151 #if SIZEOF_WCHAR_T == 4
2157 size_t res
= byteOutLen
;
2158 if ( buf
&& res
< n
)
2162 //we need to double-trip to verify it didn't insert any ? in place
2163 //of bogus characters
2164 wxWCharBuffer
wcBuf(n
);
2165 size_t pszlen
= wxWcslen(psz
);
2166 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2167 wxWcslen(wcBuf
) != pszlen
||
2168 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2170 // we didn't obtain the same thing we started from, hence
2171 // the conversion was lossy and we consider that it failed
2180 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2183 TECObjectRef m_MB2WC_converter
;
2184 TECObjectRef m_WC2MB_converter
;
2186 TextEncodingBase m_char_encoding
;
2187 TextEncodingBase m_unicode_encoding
;
2190 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2192 // ============================================================================
2193 // wxEncodingConverter based conversion classes
2194 // ============================================================================
2198 class wxMBConv_wxwin
: public wxMBConv
2203 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2204 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2208 // temporarily just use wxEncodingConverter stuff,
2209 // so that it works while a better implementation is built
2210 wxMBConv_wxwin(const wxChar
* name
)
2213 m_enc
= wxFontMapper::Get()->CharsetToEncoding(name
, false);
2215 m_enc
= wxFONTENCODING_SYSTEM
;
2220 wxMBConv_wxwin(wxFontEncoding enc
)
2227 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2229 size_t inbuf
= strlen(psz
);
2232 if (!m2w
.Convert(psz
,buf
))
2238 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2240 const size_t inbuf
= wxWcslen(psz
);
2243 if (!w2m
.Convert(psz
,buf
))
2250 bool IsOk() const { return m_ok
; }
2253 wxFontEncoding m_enc
;
2254 wxEncodingConverter m2w
, w2m
;
2256 // were we initialized successfully?
2259 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2262 #endif // wxUSE_FONTMAP
2264 // ============================================================================
2265 // wxCSConv implementation
2266 // ============================================================================
2268 void wxCSConv::Init()
2275 wxCSConv::wxCSConv(const wxChar
*charset
)
2284 m_encoding
= wxFONTENCODING_SYSTEM
;
2287 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2289 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2291 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2293 encoding
= wxFONTENCODING_SYSTEM
;
2298 m_encoding
= encoding
;
2301 wxCSConv::~wxCSConv()
2306 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2311 SetName(conv
.m_name
);
2312 m_encoding
= conv
.m_encoding
;
2315 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2319 SetName(conv
.m_name
);
2320 m_encoding
= conv
.m_encoding
;
2325 void wxCSConv::Clear()
2334 void wxCSConv::SetName(const wxChar
*charset
)
2338 m_name
= wxStrdup(charset
);
2343 wxMBConv
*wxCSConv::DoCreate() const
2345 // check for the special case of ASCII or ISO8859-1 charset: as we have
2346 // special knowledge of it anyhow, we don't need to create a special
2347 // conversion object
2348 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
2350 // don't convert at all
2354 // we trust OS to do conversion better than we can so try external
2355 // conversion methods first
2357 // the full order is:
2358 // 1. OS conversion (iconv() under Unix or Win32 API)
2359 // 2. hard coded conversions for UTF
2360 // 3. wxEncodingConverter as fall back
2366 #endif // !wxUSE_FONTMAP
2368 wxString
name(m_name
);
2372 name
= wxFontMapper::Get()->GetEncodingName(m_encoding
);
2373 #endif // wxUSE_FONTMAP
2375 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
2381 #endif // HAVE_ICONV
2383 #ifdef wxHAVE_WIN32_MB2WC
2386 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2387 : new wxMBConv_win32(m_encoding
);
2396 #endif // wxHAVE_WIN32_MB2WC
2397 #if defined(__WXMAC__)
2399 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
) )
2403 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
2404 : new wxMBConv_mac(m_encoding
);
2406 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
2415 #if defined(__WXCOCOA__)
2417 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
2421 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
2422 : new wxMBConv_cocoa(m_encoding
);
2424 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
2434 wxFontEncoding enc
= m_encoding
;
2436 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2438 // use "false" to suppress interactive dialogs -- we can be called from
2439 // anywhere and popping up a dialog from here is the last thing we want to
2441 enc
= wxFontMapper::Get()->CharsetToEncoding(m_name
, false);
2443 #endif // wxUSE_FONTMAP
2447 case wxFONTENCODING_UTF7
:
2448 return new wxMBConvUTF7
;
2450 case wxFONTENCODING_UTF8
:
2451 return new wxMBConvUTF8
;
2453 case wxFONTENCODING_UTF16BE
:
2454 return new wxMBConvUTF16BE
;
2456 case wxFONTENCODING_UTF16LE
:
2457 return new wxMBConvUTF16LE
;
2459 case wxFONTENCODING_UTF32BE
:
2460 return new wxMBConvUTF32BE
;
2462 case wxFONTENCODING_UTF32LE
:
2463 return new wxMBConvUTF32LE
;
2466 // nothing to do but put here to suppress gcc warnings
2473 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2474 : new wxMBConv_wxwin(m_encoding
);
2480 #endif // wxUSE_FONTMAP
2482 // NB: This is a hack to prevent deadlock. What could otherwise happen
2483 // in Unicode build: wxConvLocal creation ends up being here
2484 // because of some failure and logs the error. But wxLog will try to
2485 // attach timestamp, for which it will need wxConvLocal (to convert
2486 // time to char* and then wchar_t*), but that fails, tries to log
2487 // error, but wxLog has a (already locked) critical section that
2488 // guards static buffer.
2489 static bool alreadyLoggingError
= false;
2490 if (!alreadyLoggingError
)
2492 alreadyLoggingError
= true;
2493 wxLogError(_("Cannot convert from the charset '%s'!"),
2497 wxFontMapper::GetEncodingDescription(m_encoding
).c_str()
2498 #else // !wxUSE_FONTMAP
2499 wxString::Format(_("encoding %s"), m_encoding
).c_str()
2500 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2502 alreadyLoggingError
= false;
2508 void wxCSConv::CreateConvIfNeeded() const
2512 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
2515 // if we don't have neither the name nor the encoding, use the default
2516 // encoding for this system
2517 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
2519 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
2521 #endif // wxUSE_INTL
2523 self
->m_convReal
= DoCreate();
2524 self
->m_deferred
= false;
2528 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2530 CreateConvIfNeeded();
2533 return m_convReal
->MB2WC(buf
, psz
, n
);
2536 size_t len
= strlen(psz
);
2540 for (size_t c
= 0; c
<= len
; c
++)
2541 buf
[c
] = (unsigned char)(psz
[c
]);
2547 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2549 CreateConvIfNeeded();
2552 return m_convReal
->WC2MB(buf
, psz
, n
);
2555 const size_t len
= wxWcslen(psz
);
2558 for (size_t c
= 0; c
<= len
; c
++)
2562 buf
[c
] = (char)psz
[c
];
2567 for (size_t c
= 0; c
<= len
; c
++)
2577 // ----------------------------------------------------------------------------
2579 // ----------------------------------------------------------------------------
2582 static wxMBConv_win32 wxConvLibcObj
;
2583 #elif defined(__WXMAC__) && !defined(__MACH__)
2584 static wxMBConv_mac wxConvLibcObj
;
2586 static wxMBConvLibc wxConvLibcObj
;
2589 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
2590 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
2591 static wxMBConvUTF7 wxConvUTF7Obj
;
2592 static wxMBConvUTF8 wxConvUTF8Obj
;
2595 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
2596 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
2597 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
2598 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
2599 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
2600 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
2602 #else // !wxUSE_WCHAR_T
2604 // stand-ins in absence of wchar_t
2605 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
2610 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T