1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
39 #include "wx/strconv.h"
44 #include "wx/msw/private.h"
48 #include "wx/msw/missing.h"
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
63 // ----------------------------------------------------------------------------
65 // ----------------------------------------------------------------------------
75 #include "wx/encconv.h"
76 #include "wx/fontmap.h"
80 #include <ATSUnicode.h>
81 #include <TextCommon.h>
82 #include <TextEncodingConverter.h>
84 #include "wx/mac/private.h" // includes mac headers
86 // ----------------------------------------------------------------------------
88 // ----------------------------------------------------------------------------
90 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
91 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
93 #if SIZEOF_WCHAR_T == 4
94 #define WC_NAME "UCS4"
95 #define WC_BSWAP BSWAP_UCS4
96 #ifdef WORDS_BIGENDIAN
97 #define WC_NAME_BEST "UCS-4BE"
99 #define WC_NAME_BEST "UCS-4LE"
101 #elif SIZEOF_WCHAR_T == 2
102 #define WC_NAME "UTF16"
103 #define WC_BSWAP BSWAP_UTF16
105 #ifdef WORDS_BIGENDIAN
106 #define WC_NAME_BEST "UTF-16BE"
108 #define WC_NAME_BEST "UTF-16LE"
110 #else // sizeof(wchar_t) != 2 nor 4
111 // does this ever happen?
112 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
115 // ============================================================================
117 // ============================================================================
119 // ----------------------------------------------------------------------------
120 // UTF-16 en/decoding to/from UCS-4
121 // ----------------------------------------------------------------------------
// Encodes one UCS-4 value as UTF-16 code unit(s) stored through `output`.
// Visible branches: a single-unit store of `input` truncated to 16 bits, a
// separate branch for input >= 0x110000 (above the last Unicode code point),
// and a two-unit surrogate pair store for the remaining values.
// NOTE(review): the return statements are not visible in this extract;
// callers in this file compare the result against (size_t)-1.
124 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
129 *output
= (wxUint16
) input
;
132 else if (input
>=0x110000)
// high surrogate: (input >> 10) + 0xd7c0 equals 0xd800 + ((input - 0x10000) >> 10)
140 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
// low surrogate: 0xdc00 plus the low ten bits of input
141 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
// Decodes one code point from the UTF-16 units at `input` into `output`.
// A unit outside 0xd800..0xdfff is handled by the first branch; otherwise
// input[1] must be a low surrogate (0xdc00..0xdfff inclusive) for the pair to
// be combined in the last branch.
// The upper bound of the low surrogate test used to be `>= 0xdfff`, which
// rejected 0xdfff itself and so made every code point whose low ten bits are
// all ones (e.g. U+10FFFF) undecodable.
147 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
149 if ((*input
<0xd800) || (*input
>0xdfff))
// second unit is not a low surrogate: invalid pair
154 else if ((input
[1]<0xdc00) || (input
[1]>0xdfff))
// inverse of the 0xd7c0 / 0xdc00 offsets used by encode_utf16
161 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
167 // ----------------------------------------------------------------------------
169 // ----------------------------------------------------------------------------
// Out-of-line destructor of the conversion base class.
171 wxMBConv::~wxMBConv()
173 // nothing to do here (probably necessary for linking on Darwin)
// Converts the NUL-terminated multibyte string psz to a wide char buffer.
// Works in two passes: MB2WC() with a NULL destination yields the required
// length, then a buffer of that length is filled by a second MB2WC() call.
// A buffer constructed from a NULL pointer is built on the fall-through path
// (the return statements themselves are not visible in this extract).
176 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
180 // calculate the length of the buffer needed first
181 size_t nLen
= MB2WC(NULL
, psz
, 0);
182 if ( nLen
!= (size_t)-1 )
184 // now do the actual conversion
185 wxWCharBuffer
buf(nLen
);
186 nLen
= MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NULL
187 if ( nLen
!= (size_t)-1 )
// conversion failed (or psz could not be measured): build an empty buffer
194 wxWCharBuffer
buf((wchar_t *)NULL
);
// Converts the NUL-terminated wide string pwz to a multibyte buffer.
// Same two-pass scheme as cMB2WC(): measure with a NULL destination, then
// convert into a buffer sized from that length. The buffer is constructed
// with nLen+3 and the conversion is told it may write nLen+4 bytes, leaving
// room for a terminator up to four bytes wide.
199 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
203 size_t nLen
= WC2MB(NULL
, pwz
, 0);
204 if ( nLen
!= (size_t)-1 )
206 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
207 nLen
= WC2MB(buf
.data(), pwz
, nLen
+ 4);
208 if ( nLen
!= (size_t)-1 )
// conversion failed: build an empty buffer
215 wxCharBuffer
buf((char *)NULL
);
// Converts a multibyte string of nStringLen bytes which may contain embedded
// NULs: each NUL-terminated substring is converted in turn with the 3-argument
// MB2WC() and the results (each with its terminator) are laid out one after
// another in szBuffer.
//
// szBuffer may be NULL, in which case only the length is computed; outsize is
// the capacity of szBuffer in wide characters. Returns the total number of
// wide characters produced, not counting the final terminator.
//
// The destination index used to be the *input* offset (szPos - szStart),
// which only matches the output position when every character converts to
// exactly one wide character; for real multibyte input it left gaps and could
// write past the "nActualLength <= outsize" bound. The output is now placed at
// the running output offset, which is what the bound check and the returned
// length are expressed in.
220 size_t wxMBConv::MB2WC(wchar_t* szBuffer
, const char* szString
,
221 size_t outsize
, size_t nStringLen
) const
223 const char* szEnd
= szString
+ nStringLen
+ 1;
224 const char* szPos
= szString
;
227 size_t nActualLength
= 0;
229 //Convert the string until the length() is reached, continuing the
230 //loop every time a null character is reached
// NOTE(review): the loop only ends when szPos lands exactly on szEnd, i.e.
// szString[nStringLen] must be a NUL; otherwise szPos steps over szEnd.
231 while(szPos
!= szEnd
)
233 wxASSERT(szPos
< szEnd
); //something is _really_ screwed up if this assert fires
235 //Get the length of the current (sub)string
236 size_t nLen
= MB2WC(NULL
, szPos
, 0);
238 //Invalid conversion?
239 if( nLen
== (size_t)-1 )
242 //Increase the actual length (+1 for current null character)
243 nActualLength
+= nLen
+ 1;
245 //Only copy data in if buffer size is big enough
246 if (szBuffer
!= NULL
&&
247 nActualLength
<= outsize
)
249 //Convert the current (sub)string to the start of its output slot,
//i.e. the output length before this substring was accounted for
250 if ( MB2WC(&szBuffer
[nActualLength
- nLen
- 1], szPos
, nLen
+ 1) == (size_t)-1 )
254 //Increment to next (sub)string
255 //Note that we have to use strlen here instead of nLen
256 //because XX2XX gives us the size of the output buffer,
257 //not necessarily the length of the string
258 szPos
+= strlen(szPos
) + 1;
261 return nActualLength
- 1; //success - return actual length
// Converts a wide string of nStringLen characters which may contain embedded
// NULs: each NUL-terminated substring is converted in turn with the 3-argument
// WC2MB() and the results (each with its terminator) are laid out one after
// another in szBuffer.
//
// szBuffer may be NULL, in which case only the length is computed; outsize is
// the capacity of szBuffer in bytes. Returns the total number of bytes
// produced, not counting the final terminator.
//
// The destination index used to be the *input* offset (szPos - szStart).
// Whenever a wide character converts to more than one byte the output of a
// substring is longer than its input, so the next substring was written on
// top of the tail of the previous one. The output is now placed at the
// running output offset, which is what the bound check and the returned
// length are expressed in.
264 size_t wxMBConv::WC2MB(char* szBuffer
, const wchar_t* szString
,
265 size_t outsize
, size_t nStringLen
) const
267 const wchar_t* szEnd
= szString
+ nStringLen
+ 1;
268 const wchar_t* szPos
= szString
;
271 size_t nActualLength
= 0;
273 //Convert the string until the length() is reached, continuing the
274 //loop every time a null character is reached
// NOTE(review): the loop only ends when szPos lands exactly on szEnd, i.e.
// szString[nStringLen] must be a NUL; otherwise szPos steps over szEnd.
275 while(szPos
!= szEnd
)
277 wxASSERT(szPos
< szEnd
); //something is _really_ screwed up if this assert fires
279 //Get the length of the current (sub)string
280 size_t nLen
= WC2MB(NULL
, szPos
, 0);
282 //Invalid conversion?
283 if( nLen
== (size_t)-1 )
286 //Increase the actual length (+1 for current null character)
287 nActualLength
+= nLen
+ 1;
289 //Only copy data in if buffer size is big enough
290 if (szBuffer
!= NULL
&&
291 nActualLength
<= outsize
)
293 //Convert the current (sub)string to the start of its output slot,
//i.e. the output length before this substring was accounted for
294 if(WC2MB(&szBuffer
[nActualLength
- nLen
- 1], szPos
, nLen
+ 1) == (size_t)-1 )
298 //Increment to next (sub)string
299 //Note that we have to use wxWcslen here instead of nLen
300 //because XX2XX gives us the size of the output buffer,
301 //not necessarily the length of the string
302 szPos
+= wxWcslen(szPos
) + 1;
305 return nActualLength
- 1; //success - return actual length
308 // ----------------------------------------------------------------------------
310 // ----------------------------------------------------------------------------
// Multibyte to wide char conversion which simply forwards all three
// arguments to wxMB2WC() and returns its result unchanged.
312 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
314 return wxMB2WC(buf
, psz
, n
);
// Wide char to multibyte conversion which simply forwards all three
// arguments to wxWC2MB() and returns its result unchanged.
317 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
319 return wxWC2MB(buf
, psz
, n
);
321 // ----------------------------------------------------------------------------
323 // ----------------------------------------------------------------------------
325 // Implementation (C) 2004 Fredrik Roubert
328 // BASE64 decoding table
// Indexed by byte value (32 rows of 8 entries = 256 entries): yields the
// 6-bit value of a base64 digit, or 0xff for bytes which are not base64
// digits ('A'-'Z' -> 0x00-0x19, 'a'-'z' -> 0x1a-0x33, '0'-'9' -> 0x34-0x3d,
// '+' -> 0x3e, '/' -> 0x3f). The UTF-7 decoder stops its base64 run at the
// first 0xff entry.
330 static const unsigned char utf7unb64
[] =
332 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
333 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
334 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
335 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
336 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
337 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
338 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
339 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
340 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
341 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
342 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
343 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
344 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
345 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
346 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
347 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
348 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
349 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
350 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
351 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
352 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
353 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
354 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
355 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
356 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
357 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
358 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
359 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
360 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
361 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
362 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
363 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
// Decodes the UTF-7 string psz into wide characters.
// buf may be NULL (the loop then runs without the "len < n" limit); n bounds
// the number of characters written when buf is given.
// NOTE(review): only fragments of the body are visible in this extract, the
// comments below describe just those fragments.
366 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
371 while (*psz
&& ((!buf
) || (len
< n
)))
373 unsigned char cc
= *psz
++;
381 else if (*psz
== '-')
391 // BASE64 encoded string
// consume input for as long as it maps to a base64 digit in utf7unb64
395 for (lsb
= false, d
= 0, l
= 0;
396 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff; psz
++)
// each digit adds 6 bits; a byte is taken out whenever 8 or more bits are
// pending, with lsb toggling on every extracted byte
400 for (l
+= 6; l
>= 8; lsb
= !lsb
)
402 c
= (d
>> (l
-= 8)) % 256;
418 if (buf
&& (len
< n
))
424 // BASE64 encoding table
// Indexed by a 6-bit value (0..63): yields the corresponding base64 digit.
426 static const unsigned char utf7enb64
[] =
428 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
429 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
430 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
431 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
432 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
433 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
434 'w', 'x', 'y', 'z', '0', '1', '2', '3',
435 '4', '5', '6', '7', '8', '9', '+', '/'
439 // UTF-7 encoding table
// Indexed by 7-bit character code (8 rows of 16 entries), each entry is the
// class of that character:
441 // 0 - Set D (directly encoded characters)
442 // 1 - Set O (optional direct characters)
443 // 2 - whitespace characters (optional)
444 // 3 - special characters
// The encoder in this file only copies characters of class 0 (entry < 1)
// directly.
446 static const unsigned char utf7encode
[128] =
448 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
449 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
450 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
451 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
452 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
453 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
454 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
455 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
// Encodes the wide string psz as UTF-7.
// buf may be NULL (the loop then runs without the "len < n" limit); n bounds
// the output length when buf is given.
// NOTE(review): only fragments of the body are visible in this extract, the
// comments below describe just those fragments.
458 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t
459 *psz
, size_t n
) const
465 while (*psz
&& ((!buf
) || (len
< n
)))
// ASCII characters of class 0 in utf7encode take the first branch
468 if (cc
< 0x80 && utf7encode
[cc
] < 1)
// two spellings of the "above 0xffff" test appear here
// NOTE(review): presumably alternatives selected by preprocessor lines that
// are not visible -- confirm
477 else if (cc
> 0xffff)
479 else if (cc
> ((const wchar_t)0xffff))
482 // no surrogate pair generation (yet?)
493 // BASE64 encode string
494 unsigned int lsb
, d
, l
;
495 for (d
= 0, l
= 0;; psz
++)
// feed the character in as two bytes, high byte (lsb == 0) first
497 for (lsb
= 0; lsb
< 2; lsb
++)
500 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
// 8 more bits pending; emit a base64 digit for every complete group of 6
502 for (l
+= 8; l
>= 6; )
506 *buf
++ = utf7enb64
[(d
>> l
) % 64];
// the base64 run ends at the end of the string or at a class 0 character
511 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
// pad the leftover bits up to one last base64 digit
517 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
526 if (buf
&& (len
< n
))
531 // ----------------------------------------------------------------------------
533 // ----------------------------------------------------------------------------
// Upper limits used by the UTF-8 code below: the encoder picks the first
// index cnt for which the value is <= utf8_max[cnt], the decoder rejects
// decoded values which are <= utf8_max[ocnt] as illegal encodings.
535 static wxUint32 utf8_max
[]=
536 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
// Decodes the UTF-8 string psz into wide characters.
// buf may be NULL (the loop then runs without the "len < n" limit); n bounds
// the number of characters written when buf is given.
// NOTE(review): only fragments of the body are visible in this extract, the
// comments below describe just those fragments.
538 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
542 while (*psz
&& ((!buf
) || (len
< n
)))
544 unsigned char cc
= *psz
++, fc
= cc
;
// count while the top bit of fc is set
546 for (cnt
= 0; fc
& 0x80; cnt
++)
560 // invalid UTF-8 sequence
565 unsigned ocnt
= cnt
- 1;
// payload bits of the lead byte
566 wxUint32 res
= cc
& (0x3f >> cnt
);
// a continuation byte must look like 10xxxxxx
570 if ((cc
& 0xC0) != 0x80)
572 // invalid UTF-8 sequence
// append the 6 payload bits of the continuation byte
575 res
= (res
<< 6) | (cc
& 0x3f);
577 if (res
<= utf8_max
[ocnt
])
579 // illegal UTF-8 encoding
583 // cast is ok because wchar_t == wxUint16 if WC_UTF16
584 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
585 if (pa
== (size_t)-1)
594 #endif // WC_UTF16/!WC_UTF16
598 if (buf
&& (len
< n
))
// Encodes the wide string psz as UTF-8.
// buf may be NULL (the loop then runs without the "len < n" limit); n bounds
// the output length when buf is given. A terminating 0 is stored only if
// there is still room for it (len < n).
// NOTE(review): only fragments of the body are visible in this extract, the
// comments below describe just those fragments.
603 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
607 while (*psz
&& ((!buf
) || (len
< n
)))
611 // cast is ok for WC_UTF16
612 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
// on a decoding error skip a single unit instead of failing
613 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
615 cc
=(*psz
++) & 0x7fffffff;
// smallest cnt whose utf8_max entry can hold cc
618 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
// lead byte: length marker bits combined with the topmost payload bits
// NOTE(review): relies on >> of the negative constant -128 being an
// arithmetic shift
632 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
// continuation byte: 10xxxxxx carrying the next 6 payload bits
634 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
639 if (buf
&& (len
<n
)) *buf
= 0;
647 // ----------------------------------------------------------------------------
649 // ----------------------------------------------------------------------------
651 #ifdef WORDS_BIGENDIAN
652 #define wxMBConvUTF16straight wxMBConvUTF16BE
653 #define wxMBConvUTF16swap wxMBConvUTF16LE
655 #define wxMBConvUTF16swap wxMBConvUTF16BE
656 #define wxMBConvUTF16straight wxMBConvUTF16LE
662 // copy 16bit MB to 16bit String
// Native-endian UTF-16 input to 16-bit wchar_t: units are copied one by one
// until a zero unit is read or, when buf is given, n characters were
// produced. A terminating 0 is stored only if there is still room (len < n).
663 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
667 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
670 *buf
++ = *(wxUint16
*)psz
;
673 psz
+= sizeof(wxUint16
);
675 if (buf
&& len
<n
) *buf
=0;
681 // copy 16bit String to 16bit MB
// 16-bit wchar_t to native-endian UTF-16 output: each character is stored as
// one wxUint16 and len advances by sizeof(wxUint16) bytes per character;
// n is the size of buf in bytes.
// The room check for the 16-bit terminator is written as an addition: the
// former "len <= n - sizeof(wxUint16)" wrapped around for n smaller than
// sizeof(wxUint16) (n is unsigned), so the terminator was stored past the end
// of the buffer.
682 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
686 while (*psz
&& (!buf
|| len
< n
))
690 *(wxUint16
*)buf
= *psz
;
691 buf
+= sizeof(wxUint16
);
693 len
+= sizeof(wxUint16
);
696 if (buf
&& len
+sizeof(wxUint16
)<=n
) *(wxUint16
*)buf
=0;
702 // swap 16bit MB to 16bit String
// Opposite-endian UTF-16 input to 16-bit wchar_t: the two bytes of every
// unit are exchanged while copying. A terminating 0 is stored only if there
// is still room (len < n).
703 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
707 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
711 ((char *)buf
)[0] = psz
[1];
712 ((char *)buf
)[1] = psz
[0];
716 psz
+= sizeof(wxUint16
);
718 if (buf
&& len
<n
) *buf
=0;
724 // swap 16bit String to 16bit MB
// 16-bit wchar_t to opposite-endian UTF-16 output: the two bytes of every
// character are written in exchanged order and len advances by
// sizeof(wxUint16) bytes per character; n is the size of buf in bytes.
// The room check for the 16-bit terminator is written as an addition: the
// former "len <= n - sizeof(wxUint16)" wrapped around for n smaller than
// sizeof(wxUint16) (n is unsigned), so the terminator was stored past the end
// of the buffer.
725 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
729 while (*psz
&& (!buf
|| len
< n
))
733 *buf
++ = ((char*)psz
)[1];
734 *buf
++ = ((char*)psz
)[0];
736 len
+= sizeof(wxUint16
);
739 if (buf
&& len
+sizeof(wxUint16
)<=n
) *(wxUint16
*)buf
=0;
748 // copy 16bit MB to 32bit String
// Native-endian UTF-16 input to 32-bit wchar_t: every character is decoded
// with decode_utf16(), whose result pa is the number of 16-bit units to skip
// in the input (or (size_t)-1 on error). A terminating 0 is stored only if
// there is still room (len < n).
749 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
753 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
756 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
757 if (pa
== (size_t)-1)
763 psz
+= pa
* sizeof(wxUint16
);
765 if (buf
&& len
<n
) *buf
=0;
771 // copy 32bit String to 16bit MB
// 32-bit wchar_t to native-endian UTF-16 output: every character is encoded
// with encode_utf16() into cc, pa being the number of 16-bit units produced
// (or (size_t)-1 on error); len advances by pa units. n is the size of buf
// in bytes.
// The room check for the 16-bit terminator is written as an addition: the
// former "len <= n - sizeof(wxUint16)" wrapped around for n smaller than
// sizeof(wxUint16) (n is unsigned), so the terminator was stored past the end
// of the buffer.
772 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
776 while (*psz
&& (!buf
|| len
< n
))
779 size_t pa
=encode_utf16(*psz
, cc
);
781 if (pa
== (size_t)-1)
786 *(wxUint16
*)buf
= cc
[0];
787 buf
+= sizeof(wxUint16
);
790 *(wxUint16
*)buf
= cc
[1];
791 buf
+= sizeof(wxUint16
);
795 len
+= pa
*sizeof(wxUint16
);
798 if (buf
&& len
+sizeof(wxUint16
)<=n
) *(wxUint16
*)buf
=0;
804 // swap 16bit MB to 32bit String
// Opposite-endian UTF-16 input to 32-bit wchar_t: two units are byte-swapped
// into tmp, decoded with decode_utf16(), and the input advances by the pa
// units actually used. A terminating 0 is stored only if there is still room
// (len < n).
805 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
809 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
// always reads four input bytes, even if the first unit stands alone
813 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
814 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
816 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
817 if (pa
== (size_t)-1)
824 psz
+= pa
* sizeof(wxUint16
);
826 if (buf
&& len
<n
) *buf
=0;
832 // swap 32bit String to 16bit MB
// 32-bit wchar_t to opposite-endian UTF-16 output: every character is
// encoded with encode_utf16() into cc and the bytes of each produced unit are
// written in exchanged order; len advances by pa units. n is the size of buf
// in bytes.
// The room check for the 16-bit terminator is written as an addition: the
// former "len <= n - sizeof(wxUint16)" wrapped around for n smaller than
// sizeof(wxUint16) (n is unsigned), so the terminator was stored past the end
// of the buffer.
833 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
837 while (*psz
&& (!buf
|| len
< n
))
840 size_t pa
=encode_utf16(*psz
, cc
);
842 if (pa
== (size_t)-1)
847 *buf
++ = ((char*)cc
)[1];
848 *buf
++ = ((char*)cc
)[0];
851 *buf
++ = ((char*)cc
)[3];
852 *buf
++ = ((char*)cc
)[2];
856 len
+= pa
*sizeof(wxUint16
);
859 if (buf
&& len
+sizeof(wxUint16
)<=n
) *(wxUint16
*)buf
=0;
867 // ----------------------------------------------------------------------------
869 // ----------------------------------------------------------------------------
871 #ifdef WORDS_BIGENDIAN
872 #define wxMBConvUTF32straight wxMBConvUTF32BE
873 #define wxMBConvUTF32swap wxMBConvUTF32LE
875 #define wxMBConvUTF32swap wxMBConvUTF32BE
876 #define wxMBConvUTF32straight wxMBConvUTF32LE
880 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
881 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
886 // copy 32bit MB to 16bit String
// Native-endian UTF-32 input to 16-bit wchar_t: every 32-bit value is
// encoded with encode_utf16() into cc ((size_t)-1 signals an error) and the
// input advances by one wxUint32. A terminating 0 is stored only if there is
// still room (len < n).
887 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
891 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
895 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
896 if (pa
== (size_t)-1)
906 psz
+= sizeof(wxUint32
);
908 if (buf
&& len
<n
) *buf
=0;
914 // copy 16bit String to 32bit MB
// 16-bit wchar_t to native-endian UTF-32 output: every character is decoded
// with decode_utf16() into cc ((size_t)-1 signals an error) and stored as one
// wxUint32; len advances by sizeof(wxUint32) bytes. n is the size of buf in
// bytes.
// The room check guarding the 32-bit terminator is written as an addition:
// the former "len <= n - sizeof(wxUint32)" wrapped around for n smaller than
// sizeof(wxUint32) (n is unsigned) and so passed for buffers which are too
// small.
915 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
919 while (*psz
&& (!buf
|| len
< n
))
923 // cast is ok for WC_UTF16
924 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
925 if (pa
== (size_t)-1)
930 *(wxUint32
*)buf
= cc
;
931 buf
+= sizeof(wxUint32
);
933 len
+= sizeof(wxUint32
);
937 if (buf
&& len
+sizeof(wxUint32
)<=n
)
945 // swap 32bit MB to 16bit String
// Opposite-endian UTF-32 input to 16-bit wchar_t: the four bytes of every
// value are reversed into tmp, the result is encoded with encode_utf16()
// into cc ((size_t)-1 signals an error) and the input advances by one
// wxUint32.
946 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
950 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
953 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
954 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
959 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
960 if (pa
== (size_t)-1)
970 psz
+= sizeof(wxUint32
);
980 // swap 16bit String to 32bit MB
// 16-bit wchar_t to opposite-endian UTF-32 output: every character is
// decoded with decode_utf16() into the bytes of cc ((size_t)-1 signals an
// error); len advances by sizeof(wxUint32) bytes. n is the size of buf in
// bytes.
// The room check guarding the 32-bit terminator is written as an addition:
// the former "len <= n - sizeof(wxUint32)" wrapped around for n smaller than
// sizeof(wxUint32) (n is unsigned) and so passed for buffers which are too
// small.
981 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
985 while (*psz
&& (!buf
|| len
< n
))
989 // cast is ok for WC_UTF16
990 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
991 if (pa
== (size_t)-1)
1001 len
+= sizeof(wxUint32
);
1005 if (buf
&& len
+sizeof(wxUint32
)<=n
)
1014 // copy 32bit MB to 32bit String
// Native-endian UTF-32 input to 32-bit wchar_t: values are copied one by one
// until a zero value is read or, when buf is given, n characters were
// produced.
1015 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1019 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1022 *buf
++ = *(wxUint32
*)psz
;
1024 psz
+= sizeof(wxUint32
);
1034 // copy 32bit String to 32bit MB
// 32-bit wchar_t to native-endian UTF-32 output: every character is stored
// as one wxUint32 and len advances by sizeof(wxUint32) bytes; n is the size
// of buf in bytes.
// The room check guarding the 32-bit terminator is written as an addition:
// the former "len <= n - sizeof(wxUint32)" wrapped around for n smaller than
// sizeof(wxUint32) (n is unsigned) and so passed for buffers which are too
// small.
1035 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1039 while (*psz
&& (!buf
|| len
< n
))
1043 *(wxUint32
*)buf
= *psz
;
1044 buf
+= sizeof(wxUint32
);
1047 len
+= sizeof(wxUint32
);
1051 if (buf
&& len
+sizeof(wxUint32
)<=n
)
1058 // swap 32bit MB to 32bit String
// Opposite-endian UTF-32 input to 32-bit wchar_t: the four bytes of every
// value are stored in reversed order and the input advances by one wxUint32.
1059 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1063 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1067 ((char *)buf
)[0] = psz
[3];
1068 ((char *)buf
)[1] = psz
[2];
1069 ((char *)buf
)[2] = psz
[1];
1070 ((char *)buf
)[3] = psz
[0];
1074 psz
+= sizeof(wxUint32
);
1084 // swap 32bit String to 32bit MB
// 32-bit wchar_t to opposite-endian UTF-32 output: the four bytes of every
// character are written in reversed order and len advances by
// sizeof(wxUint32) bytes; n is the size of buf in bytes.
// The room check guarding the 32-bit terminator is written as an addition:
// the former "len <= n - sizeof(wxUint32)" wrapped around for n smaller than
// sizeof(wxUint32) (n is unsigned) and so passed for buffers which are too
// small.
1085 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1089 while (*psz
&& (!buf
|| len
< n
))
1093 *buf
++ = ((char *)psz
)[3];
1094 *buf
++ = ((char *)psz
)[2];
1095 *buf
++ = ((char *)psz
)[1];
1096 *buf
++ = ((char *)psz
)[0];
1098 len
+= sizeof(wxUint32
);
1102 if (buf
&& len
+sizeof(wxUint32
)<=n
)
1112 // ============================================================================
1113 // The classes doing conversion using the iconv_xxx() functions
1114 // ============================================================================
1118 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
1119 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
1120 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
1121 // (which means error) and says there are 0 bytes left in the input buffer --
1122 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
1123 // this alternative test for iconv() failure.
1124 // [This bug does not appear in glibc 2.2.]
1125 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1126 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1127 (errno != E2BIG || bufLeft != 0))
1129 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1132 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1134 // ----------------------------------------------------------------------------
1135 // wxMBConv_iconv: encapsulates an iconv character set
1136 // ----------------------------------------------------------------------------
// Conversion class built on iconv: it keeps one iconv descriptor per
// direction (m2w and w2m) and shares the name of the charset used for
// wchar_t between all instances through the static members below.
1138 class wxMBConv_iconv
: public wxMBConv
1141 wxMBConv_iconv(const wxChar
*name
);
1142 virtual ~wxMBConv_iconv();
1144 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1145 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
// the object is usable only if both descriptors were opened successfully
1148 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
1151 // the iconv handlers used to translate from multibyte to wide char and in
1152 // the other direction
1157 // the name (for iconv_open()) of a wide char charset -- if none is
1158 // available on this machine, it will remain NULL
1159 static const char *ms_wcCharsetName
;
1161 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1162 // different endian-ness than the native one
1163 static bool ms_wcNeedsSwap
;
// Definitions of the static members: no wchar_t charset name is known and no
// byte swapping is assumed until the first wxMBConv_iconv is constructed.
1166 const char *wxMBConv_iconv::ms_wcCharsetName
= NULL
;
1167 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
// Opens the iconv descriptors for the charset `name`.
// The first time round (ms_wcCharsetName still NULL) the name of the charset
// representing wchar_t is determined by trying WC_NAME_BEST, then WC_NAME,
// then "WCHAR_T", and a test conversion establishes whether the result has to
// be byte-swapped. Later constructions reuse the stored charset name.
1169 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1171 // Do it the hard way
// copy the name character by character, narrowing each one to char
// NOTE(review): the declaration of cname is not visible here -- confirm it
// is large enough for wxStrlen(name)+1 characters
1173 for (size_t i
= 0; i
< wxStrlen(name
)+1; i
++)
1174 cname
[i
] = (char) name
[i
];
1176 // check for charset that represents wchar_t:
1177 if (ms_wcCharsetName
== NULL
)
1179 ms_wcNeedsSwap
= false;
1181 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1182 ms_wcCharsetName
= WC_NAME_BEST
;
1183 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1185 if (m2w
== (iconv_t
)-1)
1187 // try charset w/o bytesex info (e.g. "UCS4")
1188 // and check for bytesex ourselves:
1189 ms_wcCharsetName
= WC_NAME
;
1190 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1192 // last bet, try if it knows WCHAR_T pseudo-charset
1193 if (m2w
== (iconv_t
)-1)
1195 ms_wcCharsetName
= "WCHAR_T";
1196 m2w
= iconv_open(ms_wcCharsetName
, cname
);
// one of the names worked: run a small test conversion
1199 if (m2w
!= (iconv_t
)-1)
1201 char buf
[2], *bufPtr
;
1202 wchar_t wbuf
[2], *wbufPtr
;
1210 outsz
= SIZEOF_WCHAR_T
* 2;
1214 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1215 (char**)&wbufPtr
, &outsz
);
1217 if (ICONV_FAILED(res
, insz
))
1219 ms_wcCharsetName
= NULL
;
1220 wxLogLastError(wxT("iconv"));
1221 wxLogError(_("Conversion to charset '%s' doesn't work."), name
);
// the converted character differing from the input one means that the
// charset has the other byte order
1225 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1230 ms_wcCharsetName
= NULL
;
1232 // VS: we must not output an error here, since wxWidgets will safely
1233 // fall back to using wxEncodingConverter.
1234 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name
);
1238 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName
, ms_wcNeedsSwap
);
1240 else // we already have ms_wcCharsetName
1242 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1245 // NB: don't ever pass NULL to iconv_open(), it may crash!
1246 if ( ms_wcCharsetName
)
1248 w2m
= iconv_open( cname
, ms_wcCharsetName
);
// Destructor: each descriptor is dealt with only if it had been opened
// successfully, i.e. differs from (iconv_t)-1.
1256 wxMBConv_iconv::~wxMBConv_iconv()
1258 if ( m2w
!= (iconv_t
)-1 )
1260 if ( w2m
!= (iconv_t
)-1 )
// Converts psz to wide characters using iconv.
// With a destination buffer the conversion goes straight into buf (n wide
// characters of room); without one the input is converted piecewise into a
// small temporary buffer just to count the characters needed.
// A failure of iconv is only traced, not reported as an error.
1264 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
// iconv counts in bytes: strlen(psz) input bytes, n wide chars of output
1266 size_t inbuf
= strlen(psz
);
1267 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1269 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1270 wchar_t *bufPtr
= buf
;
1271 const char *pszPtr
= psz
;
1275 // have destination buffer, convert there
1277 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1278 (char**)&bufPtr
, &outbuf
);
// characters written = room given minus room left over
1279 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1283 // convert to native endianness
1284 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
1287 // NB: iconv was given only strlen(psz) characters on input, and so
1288 // it couldn't convert the trailing zero. Let's do it ourselves
1289 // if there's some room left for it in the output buffer.
1295 // no destination buffer... convert using temp buffer
1296 // to calculate destination buffer requirement
// the temporary buffer is offered as 8 wide characters on every round
1301 outbuf
= 8*SIZEOF_WCHAR_T
;
1304 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1305 (char**)&bufPtr
, &outbuf
);
1307 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
// E2BIG only means that the temporary buffer filled up: go round again
1308 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1311 if (ICONV_FAILED(cres
, inbuf
))
1313 //VS: it is ok if iconv fails, hence trace only
1314 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
// Converts the wide string psz to multibyte using iconv.
// With a destination buffer the conversion goes straight into buf; without
// one the input is converted piecewise into a 16 byte temporary buffer just
// to count the bytes needed. If the wchar_t charset known to iconv has the
// other byte order, a byte-swapped copy of the input is converted instead.
// A failure of iconv is only traced, not reported as an error.
//
// inbuf is a size in *bytes*. The swapped copy used to be sized, copied and
// swapped as if inbuf were a character count: memcpy() read
// (inbuf+1)*SIZEOF_WCHAR_T bytes from psz, far beyond the end of the string,
// and WC_BSWAP() went over inbuf elements instead of inbuf/SIZEOF_WCHAR_T.
// All three now use the byte size of the string plus its terminator and the
// character count respectively.
1321 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
// length of the input in bytes, as this is what iconv counts in
1323 size_t inbuf
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1327 wchar_t *tmpbuf
= 0;
1331 // need to copy to temp buffer to switch endianness
1332 // this absolutely doesn't rock!
1333 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1334 // could be in read-only memory, or be accessed in some other thread)
// room for the string (inbuf bytes) and its terminating wide NUL
1335 tmpbuf
=(wchar_t*)malloc(inbuf
+SIZEOF_WCHAR_T
);
1336 memcpy(tmpbuf
,psz
,inbuf
+SIZEOF_WCHAR_T
);
// WC_BSWAP takes a count of characters, not of bytes
1337 WC_BSWAP(tmpbuf
, inbuf
/SIZEOF_WCHAR_T
)
1343 // have destination buffer, convert there
1344 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1348 // NB: iconv was given only wcslen(psz) characters on input, and so
1349 // it couldn't convert the trailing zero. Let's do it ourselves
1350 // if there's some room left for it in the output buffer.
1356 // no destination buffer... convert using temp buffer
1357 // to calculate destination buffer requirement
1361 buf
= tbuf
; outbuf
= 16;
1363 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
// E2BIG only means that the temporary buffer filled up: go round again
1366 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1374 if (ICONV_FAILED(cres
, inbuf
))
1376 //VS: it is ok if iconv fails, hence trace only
1377 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1384 #endif // HAVE_ICONV
1387 // ============================================================================
1388 // Win32 conversion classes
1389 // ============================================================================
1391 #ifdef wxHAVE_WIN32_MB2WC
1395 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1396 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1399 class wxMBConv_win32
: public wxMBConv
1404 m_CodePage
= CP_ACP
;
1408 wxMBConv_win32(const wxChar
* name
)
1410 m_CodePage
= wxCharsetToCodepage(name
);
1413 wxMBConv_win32(wxFontEncoding encoding
)
1415 m_CodePage
= wxEncodingToCodepage(encoding
);
1419 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1421 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1422 // the behaviour is not compatible with the Unix version (using iconv)
1423 // and break the library itself, e.g. wxTextInputStream::NextChar()
1424 // wouldn't work if reading an incomplete MB char didn't result in an
1426 const size_t len
= ::MultiByteToWideChar
1428 m_CodePage
, // code page
1429 MB_ERR_INVALID_CHARS
, // flags: fall on error
1430 psz
, // input string
1431 -1, // its length (NUL-terminated)
1432 buf
, // output string
1433 buf
? n
: 0 // size of output buffer
1436 // note that it returns count of written chars for buf != NULL and size
1437 // of the needed buffer for buf == NULL so in either case the length of
1438 // the string (which never includes the terminating NUL) is one less
1439 return len
? len
- 1 : (size_t)-1;
// Converts the NUL-terminated wide string pwz to the m_CodePage multibyte
// encoding using ::WideCharToMultiByte(). buf and n are only handed to the
// API when buf is non-NULL; otherwise an output size of 0 is passed.
// A lossy conversion is reported as a failure: when CanUseNoBestFit() holds
// and m_CodePage < 50000 the WC_NO_BEST_FIT_CHARS flag and the "default char
// was used" indicator are relied on, otherwise the output is converted back
// with MB2WC() and compared against pwz.
1442 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1445 we have a problem here: by default, WideCharToMultiByte() may
1446 replace characters unrepresentable in the target code page with bad
1447 quality approximations such as turning "1/2" symbol (U+00BD) into
1448 "1" for the code pages which don't have it and we, obviously, want
1449 to avoid this at any price
1451 the trouble is that this function does it _silently_, i.e. it won't
1452 even tell us whether it did or not... Win98/2000 and higher provide
1453 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1454 we have to resort to a round trip, i.e. check that converting back
1455 results in the same string -- this is, of course, expensive but
1456 otherwise we simply can't be sure to not garble the data.
1459 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1460 // it doesn't work with CJK encodings (which we test for rather roughly
1461 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
// usedDef is the flag WideCharToMultiByte() fills in through pUsedDef
1463 BOOL usedDef
wxDUMMY_INITIALIZE(false);
1466 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1468 // it's our lucky day
1469 flags
= WC_NO_BEST_FIT_CHARS
;
1470 pUsedDef
= &usedDef
;
1472 else // old system or unsupported encoding
// the conversion proper; len is the value returned by the API
1478 const size_t len
= ::WideCharToMultiByte
1480 m_CodePage
, // code page
1481 flags
, // either none or no best fit
1482 pwz
, // input string
1483 -1, // it is (wide) NUL-terminated
1484 buf
, // output buffer
1485 buf
? n
: 0, // and its size
1486 NULL
, // default "replacement" char
1487 pUsedDef
// [out] was it used?
1492 // function totally failed
1496 // if we were really converting, check if we succeeded
1501 // check if the conversion failed, i.e. if any replacements
1506 else // we must resort to double tripping...
// round trip: convert buf back to wide characters and compare with the input
1508 wxWCharBuffer
wcBuf(n
);
1509 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1510 wcscmp(wcBuf
, pwz
) != 0 )
1512 // we didn't obtain the same thing we started from, hence
1513 // the conversion was lossy and we consider that it failed
1519 // see the comment above for the reason of "len - 1"
// The converter is usable unless m_CodePage holds the value -1.
1523 bool IsOk() const { return m_CodePage
!= -1; }
// Tells whether WC_NO_BEST_FIT_CHARS can be relied on. The answer is derived
// from wxGetOsVersion() on the first call and cached in the function-local
// static s_isWin98Or2k, where -1 means "not determined yet" and 1 means yes.
1526 static bool CanUseNoBestFit()
1528 static int s_isWin98Or2k
= -1;
1530 if ( s_isWin98Or2k
== -1 )
1533 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
// this branch needs major >= 4 together with minor >= 10
1536 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
// this branch needs major >= 5
1540 s_isWin98Or2k
= verMaj
>= 5;
1544 // unknown, be conservative by default
1548 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1551 return s_isWin98Or2k
== 1;
1557 #endif // wxHAVE_WIN32_MB2WC
1559 // ============================================================================
1560 // Cocoa conversion classes
1561 // ============================================================================
1563 #if defined(__WXCOCOA__)
1565 // RN: There is no UTF-32 support in either Core Foundation or
1566 // Cocoa. Strangely enough, Core Foundation uses
1567 // UTF-32 internally quite a bit - it's just not public (yet).
1569 #include <CoreFoundation/CFString.h>
1570 #include <CoreFoundation/CFStringEncodingExt.h>
// Maps a wxFontEncoding value to the corresponding CFStringEncoding.
// wxFONTENCODING_DEFAULT yields CFStringGetSystemEncoding(); for the other
// values enc starts out as kCFStringEncodingInvalidId and is overwritten by
// the matching case below.
// NOTE(review): encodings without a case presumably come back as
// kCFStringEncodingInvalidId -- confirm against the return statement.
1572 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
1574 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
1575 if ( encoding
== wxFONTENCODING_DEFAULT
)
1577 enc
= CFStringGetSystemEncoding();
1579 else switch( encoding
)
// ISO 8859 family
1581 case wxFONTENCODING_ISO8859_1
:
1582 enc
= kCFStringEncodingISOLatin1
;
1584 case wxFONTENCODING_ISO8859_2
:
1585 enc
= kCFStringEncodingISOLatin2
;
1587 case wxFONTENCODING_ISO8859_3
:
1588 enc
= kCFStringEncodingISOLatin3
;
1590 case wxFONTENCODING_ISO8859_4
:
1591 enc
= kCFStringEncodingISOLatin4
;
1593 case wxFONTENCODING_ISO8859_5
:
1594 enc
= kCFStringEncodingISOLatinCyrillic
;
1596 case wxFONTENCODING_ISO8859_6
:
1597 enc
= kCFStringEncodingISOLatinArabic
;
1599 case wxFONTENCODING_ISO8859_7
:
1600 enc
= kCFStringEncodingISOLatinGreek
;
1602 case wxFONTENCODING_ISO8859_8
:
1603 enc
= kCFStringEncodingISOLatinHebrew
;
1605 case wxFONTENCODING_ISO8859_9
:
1606 enc
= kCFStringEncodingISOLatin5
;
1608 case wxFONTENCODING_ISO8859_10
:
1609 enc
= kCFStringEncodingISOLatin6
;
1611 case wxFONTENCODING_ISO8859_11
:
1612 enc
= kCFStringEncodingISOLatinThai
;
1614 case wxFONTENCODING_ISO8859_13
:
1615 enc
= kCFStringEncodingISOLatin7
;
1617 case wxFONTENCODING_ISO8859_14
:
1618 enc
= kCFStringEncodingISOLatin8
;
1620 case wxFONTENCODING_ISO8859_15
:
1621 enc
= kCFStringEncodingISOLatin9
;
// KOI8 and the "alternative" Cyrillic encoding
1624 case wxFONTENCODING_KOI8
:
1625 enc
= kCFStringEncodingKOI8_R
;
1627 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
1628 enc
= kCFStringEncodingDOSRussian
;
1631 // case wxFONTENCODING_BULGARIAN :
// DOS code pages
1635 case wxFONTENCODING_CP437
:
1636 enc
=kCFStringEncodingDOSLatinUS
;
1638 case wxFONTENCODING_CP850
:
1639 enc
= kCFStringEncodingDOSLatin1
;
1641 case wxFONTENCODING_CP852
:
1642 enc
= kCFStringEncodingDOSLatin2
;
1644 case wxFONTENCODING_CP855
:
1645 enc
= kCFStringEncodingDOSCyrillic
;
// same CoreFoundation encoding as wxFONTENCODING_ALTERNATIVE above
1647 case wxFONTENCODING_CP866
:
1648 enc
=kCFStringEncodingDOSRussian
;
1650 case wxFONTENCODING_CP874
:
1651 enc
= kCFStringEncodingDOSThai
;
1653 case wxFONTENCODING_CP932
:
1654 enc
= kCFStringEncodingDOSJapanese
;
1656 case wxFONTENCODING_CP936
:
1657 enc
=kCFStringEncodingDOSChineseSimplif
;
1659 case wxFONTENCODING_CP949
:
1660 enc
= kCFStringEncodingDOSKorean
;
1662 case wxFONTENCODING_CP950
:
1663 enc
= kCFStringEncodingDOSChineseTrad
;
// Windows code pages
1665 case wxFONTENCODING_CP1250
:
1666 enc
= kCFStringEncodingWindowsLatin2
;
1668 case wxFONTENCODING_CP1251
:
1669 enc
=kCFStringEncodingWindowsCyrillic
;
1671 case wxFONTENCODING_CP1252
:
1672 enc
=kCFStringEncodingWindowsLatin1
;
1674 case wxFONTENCODING_CP1253
:
1675 enc
= kCFStringEncodingWindowsGreek
;
1677 case wxFONTENCODING_CP1254
:
1678 enc
= kCFStringEncodingWindowsLatin5
;
1680 case wxFONTENCODING_CP1255
:
1681 enc
=kCFStringEncodingWindowsHebrew
;
1683 case wxFONTENCODING_CP1256
:
1684 enc
=kCFStringEncodingWindowsArabic
;
1686 case wxFONTENCODING_CP1257
:
1687 enc
= kCFStringEncodingWindowsBalticRim
;
1689 // This only really encodes to UTF7 (if that) evidently
1690 // case wxFONTENCODING_UTF7 :
1691 // enc = kCFStringEncodingNonLossyASCII ;
// Unicode transformation formats and EUC
1693 case wxFONTENCODING_UTF8
:
1694 enc
= kCFStringEncodingUTF8
;
1696 case wxFONTENCODING_EUC_JP
:
1697 enc
= kCFStringEncodingEUC_JP
;
// UTF-16 corresponds to CoreFoundation's kCFStringEncodingUnicode
1699 case wxFONTENCODING_UTF16
:
1700 enc
= kCFStringEncodingUnicode
;
// Mac OS encodings
1702 case wxFONTENCODING_MACROMAN
:
1703 enc
= kCFStringEncodingMacRoman
;
1705 case wxFONTENCODING_MACJAPANESE
:
1706 enc
= kCFStringEncodingMacJapanese
;
1708 case wxFONTENCODING_MACCHINESETRAD
:
1709 enc
= kCFStringEncodingMacChineseTrad
;
1711 case wxFONTENCODING_MACKOREAN
:
1712 enc
= kCFStringEncodingMacKorean
;
1714 case wxFONTENCODING_MACARABIC
:
1715 enc
= kCFStringEncodingMacArabic
;
1717 case wxFONTENCODING_MACHEBREW
:
1718 enc
= kCFStringEncodingMacHebrew
;
1720 case wxFONTENCODING_MACGREEK
:
1721 enc
= kCFStringEncodingMacGreek
;
1723 case wxFONTENCODING_MACCYRILLIC
:
1724 enc
= kCFStringEncodingMacCyrillic
;
1726 case wxFONTENCODING_MACDEVANAGARI
:
1727 enc
= kCFStringEncodingMacDevanagari
;
1729 case wxFONTENCODING_MACGURMUKHI
:
1730 enc
= kCFStringEncodingMacGurmukhi
;
1732 case wxFONTENCODING_MACGUJARATI
:
1733 enc
= kCFStringEncodingMacGujarati
;
1735 case wxFONTENCODING_MACORIYA
:
1736 enc
= kCFStringEncodingMacOriya
;
1738 case wxFONTENCODING_MACBENGALI
:
1739 enc
= kCFStringEncodingMacBengali
;
1741 case wxFONTENCODING_MACTAMIL
:
1742 enc
= kCFStringEncodingMacTamil
;
1744 case wxFONTENCODING_MACTELUGU
:
1745 enc
= kCFStringEncodingMacTelugu
;
1747 case wxFONTENCODING_MACKANNADA
:
1748 enc
= kCFStringEncodingMacKannada
;
1750 case wxFONTENCODING_MACMALAJALAM
:
1751 enc
= kCFStringEncodingMacMalayalam
;
1753 case wxFONTENCODING_MACSINHALESE
:
1754 enc
= kCFStringEncodingMacSinhalese
;
1756 case wxFONTENCODING_MACBURMESE
:
1757 enc
= kCFStringEncodingMacBurmese
;
1759 case wxFONTENCODING_MACKHMER
:
1760 enc
= kCFStringEncodingMacKhmer
;
1762 case wxFONTENCODING_MACTHAI
:
1763 enc
= kCFStringEncodingMacThai
;
1765 case wxFONTENCODING_MACLAOTIAN
:
1766 enc
= kCFStringEncodingMacLaotian
;
1768 case wxFONTENCODING_MACGEORGIAN
:
1769 enc
= kCFStringEncodingMacGeorgian
;
1771 case wxFONTENCODING_MACARMENIAN
:
1772 enc
= kCFStringEncodingMacArmenian
;
1774 case wxFONTENCODING_MACCHINESESIMP
:
1775 enc
= kCFStringEncodingMacChineseSimp
;
1777 case wxFONTENCODING_MACTIBETAN
:
1778 enc
= kCFStringEncodingMacTibetan
;
1780 case wxFONTENCODING_MACMONGOLIAN
:
1781 enc
= kCFStringEncodingMacMongolian
;
1783 case wxFONTENCODING_MACETHIOPIC
:
1784 enc
= kCFStringEncodingMacEthiopic
;
1786 case wxFONTENCODING_MACCENTRALEUR
:
1787 enc
= kCFStringEncodingMacCentralEurRoman
;
1789 case wxFONTENCODING_MACVIATNAMESE
:
1790 enc
= kCFStringEncodingMacVietnamese
;
1792 case wxFONTENCODING_MACARABICEXT
:
1793 enc
= kCFStringEncodingMacExtArabic
;
1795 case wxFONTENCODING_MACSYMBOL
:
1796 enc
= kCFStringEncodingMacSymbol
;
1798 case wxFONTENCODING_MACDINGBATS
:
1799 enc
= kCFStringEncodingMacDingbats
;
1801 case wxFONTENCODING_MACTURKISH
:
1802 enc
= kCFStringEncodingMacTurkish
;
1804 case wxFONTENCODING_MACCROATIAN
:
1805 enc
= kCFStringEncodingMacCroatian
;
1807 case wxFONTENCODING_MACICELANDIC
:
1808 enc
= kCFStringEncodingMacIcelandic
;
1810 case wxFONTENCODING_MACROMANIAN
:
1811 enc
= kCFStringEncodingMacRomanian
;
1813 case wxFONTENCODING_MACCELTIC
:
1814 enc
= kCFStringEncodingMacCeltic
;
1816 case wxFONTENCODING_MACGAELIC
:
1817 enc
= kCFStringEncodingMacGaelic
;
1819 // case wxFONTENCODING_MACKEYBOARD :
1820 // enc = kCFStringEncodingMacKeyboardGlyphs ;
1823 // because gcc is picky
// wxMBConv implementation for wxCocoa that goes through CFString: the
// multibyte side uses the CFStringEncoding kept in m_encoding, while the wide
// side is exchanged with CoreFoundation as UniChar data, with an additional
// wxMBConvUTF16 / wxMBConvUTF16BE pass when SIZEOF_WCHAR_T == 4.
1829 class wxMBConv_cocoa
: public wxMBConv
// start from the system encoding
1834 Init(CFStringGetSystemEncoding()) ;
// construct from a charset name, resolved non-interactively by wxFontMapper
1837 wxMBConv_cocoa(const wxChar
* name
)
1839 Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, false) ) ) ;
// construct from a wxFontEncoding value
1842 wxMBConv_cocoa(wxFontEncoding encoding
)
1844 Init( wxCFStringEncFromFontEnc(encoding
) );
// records the CFStringEncoding used by both conversion directions
1851 void Init( CFStringEncoding encoding
)
1853 m_encoding
= encoding
;
// Multibyte -> wide: builds a CFString from szUnConv and copies its UniChar
// contents out; nOutLength is the CFString length.
1856 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
1860 CFStringRef theString
= CFStringCreateWithBytes (
1861 NULL
, //the allocator
1862 (const UInt8
*)szUnConv
,
1865 false //no BOM/external representation
1868 wxASSERT(theString
);
1870 size_t nOutLength
= CFStringGetLength(theString
);
1874 CFRelease(theString
);
// NOTE(review): the range length is nOutSize, not nOutLength -- confirm it
// can never exceed the length of theString
1878 CFRange theRange
= { 0, nOutSize
};
1880 #if SIZEOF_WCHAR_T == 4
1881 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
1884 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
1886 CFRelease(theString
);
// NOTE(review): the buffer allocated above has nOutSize elements, so this
// write is only in bounds when nOutLength < nOutSize -- confirm
1888 szUniCharBuffer
[nOutLength
] = '\0' ;
1890 #if SIZEOF_WCHAR_T == 4
// expand the 16-bit units into the 4-byte wchar_t output
1891 wxMBConvUTF16 converter
;
1892 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
1893 delete[] szUniCharBuffer
;
// Wide -> multibyte: wraps the input in a non-copying CFString and extracts
// it in m_encoding; the value returned is nRealOutSize - 1.
1899 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
1903 size_t nRealOutSize
;
1904 size_t nBufSize
= wxWcslen(szUnConv
);
// by default the input is reinterpreted in place as UniChar data
1905 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
1907 #if SIZEOF_WCHAR_T == 4
// with a 4-byte wchar_t the input is first re-encoded into a temporary
// UniChar buffer; nBufSize ends up as a count of UniChar units
1908 wxMBConvUTF16BE converter
;
1909 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
1910 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
1911 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
1912 nBufSize
/= sizeof(UniChar
);
1915 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
1919 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
1922 wxASSERT(theString
);
1924 //Note that CER puts a BOM when converting to unicode
1925 //so we check and use getchars instead in that case
1926 if (m_encoding
== kCFStringEncodingUnicode
)
1929 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
1931 nRealOutSize
= CFStringGetLength(theString
) + 1;
1937 CFRangeMake(0, CFStringGetLength(theString
)),
1939 0, //what to put in characters that can't be converted -
1940 //0 tells CFString to return NULL if it meets such a character
1941 false, //not an external representation
1944 (CFIndex
*) &nRealOutSize
1948 CFRelease(theString
);
1950 #if SIZEOF_WCHAR_T == 4
1951 delete[] szUniBuffer
;
1954 return nRealOutSize
- 1;
// usable when m_encoding is a valid id that CoreFoundation reports available
1959 return m_encoding
!= kCFStringEncodingInvalidId
&&
1960 CFStringIsEncodingAvailable(m_encoding
);
// the multibyte encoding this converter works with
1964 CFStringEncoding m_encoding
;
1967 #endif // defined(__WXCOCOA__)
1969 // ============================================================================
1970 // Mac conversion classes
1971 // ============================================================================
1973 #if defined(__WXMAC__) && defined(TARGET_CARBON)
// wxMBConv implementation for Carbon builds based on the Text Encoding
// Converter: Init() creates one TEC converter per direction between
// m_char_encoding and a 16-bit Unicode text encoding, and MB2WC() / WC2MB()
// push the data through them with TECConvertText(). With a 4-byte wchar_t an
// extra wxMBConvUTF16BE pass bridges between wchar_t and UniChar.
1975 class wxMBConv_mac
: public wxMBConv
// start from the system encoding
1980 Init(CFStringGetSystemEncoding()) ;
// construct from a charset name, resolved non-interactively by wxFontMapper
1983 wxMBConv_mac(const wxChar
* name
)
1985 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, false) ) ) ;
// construct from a wxFontEncoding value
1988 wxMBConv_mac(wxFontEncoding encoding
)
1990 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
// release both TEC converters
1995 OSStatus status
= noErr
;
1996 status
= TECDisposeConverter(m_MB2WC_converter
);
1997 status
= TECDisposeConverter(m_WC2MB_converter
);
// remembers the multibyte encoding and creates the two converters
2001 void Init( TextEncodingBase encoding
)
2003 OSStatus status
= noErr
;
2004 m_char_encoding
= encoding
;
2005 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2007 status
= TECCreateConverter(&m_MB2WC_converter
,
2009 m_unicode_encoding
);
2010 status
= TECCreateConverter(&m_WC2MB_converter
,
// Multibyte -> wide conversion of the NUL-terminated string psz.
2015 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2017 OSStatus status
= noErr
;
2018 ByteCount byteOutLen
;
2019 ByteCount byteInLen
= strlen(psz
) ;
2020 wchar_t *tbuf
= NULL
;
2021 UniChar
* ubuf
= NULL
;
2026 //apple specs say at least 32
// temporary output buffer sized from the input length
2027 n
= wxMax( 32 , byteInLen
) ;
2028 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2030 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2031 #if SIZEOF_WCHAR_T == 4
// separate UniChar scratch buffer, with 2 extra bytes for the terminator
2032 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
// otherwise TEC writes straight into the caller's (or temporary) buffer
2034 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2036 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2037 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2038 #if SIZEOF_WCHAR_T == 4
2039 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2040 // is not properly terminated we get random characters at the end
2041 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2042 wxMBConvUTF16BE converter
;
2043 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
// without the extra pass the result is the number of UniChar units written
2046 res
= byteOutLen
/ sizeof( UniChar
) ;
2051 if ( buf
&& res
< n
)
// Wide -> multibyte conversion of the NUL-terminated string psz.
2057 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2059 OSStatus status
= noErr
;
2060 ByteCount byteOutLen
;
2061 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2067 //apple specs say at least 32
// temporary output buffer: 8 bytes per input character plus SIZEOF_WCHAR_T
2068 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2069 tbuf
= (char*) malloc( n
) ;
2072 ByteCount byteBufferLen
= n
;
2073 UniChar
* ubuf
= NULL
;
2074 #if SIZEOF_WCHAR_T == 4
// re-encode the 4-byte wchar_t input as UniChar data for TEC
2075 wxMBConvUTF16BE converter
;
2076 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2077 byteInLen
= unicharlen
;
2078 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2079 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
// otherwise the input is passed to TEC as is
2081 ubuf
= (UniChar
*) psz
;
2083 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2084 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2085 #if SIZEOF_WCHAR_T == 4
// the result is the number of bytes TEC produced
2091 size_t res
= byteOutLen
;
2092 if ( buf
&& res
< n
)
2096 //we need to double-trip to verify it didn't insert any ? in place
2097 //of bogus characters
2098 wxWCharBuffer
wcBuf(n
);
2099 size_t pszlen
= wxWcslen(psz
);
2100 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2101 wxWcslen(wcBuf
) != pszlen
||
2102 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2104 // we didn't obtain the same thing we started from, hence
2105 // the conversion was lossy and we consider that it failed
// usable only if both TEC converters are non-NULL
2114 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
// one TEC converter per direction
2117 TECObjectRef m_MB2WC_converter
;
2118 TECObjectRef m_WC2MB_converter
;
// the multibyte encoding and the Unicode encoding the converters bridge
2120 TextEncodingBase m_char_encoding
;
2121 TextEncodingBase m_unicode_encoding
;
2124 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2126 // ============================================================================
2127 // wxEncodingConverter based conversion classes
2128 // ============================================================================
// wxMBConv built on two wxEncodingConverter objects: m2w converts from m_enc
// to wxFONTENCODING_UNICODE and w2m converts the other way round. m_ok
// records whether both converters could be initialised.
2132 class wxMBConv_wxwin
: public wxMBConv
// both directions must initialise for the object to be usable
2137 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2138 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2142 // temporarily just use wxEncodingConverter stuff,
2143 // so that it works while a better implementation is built
2144 wxMBConv_wxwin(const wxChar
* name
)
// resolve the charset name without interactive dialogs
2147 m_enc
= wxFontMapper::Get()->CharsetToEncoding(name
, false);
2149 m_enc
= wxFONTENCODING_SYSTEM
;
2154 wxMBConv_wxwin(wxFontEncoding enc
)
// Multibyte -> wide; the size argument is unused, the input length comes
// from strlen(psz).
2161 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2163 size_t inbuf
= strlen(psz
);
2165 m2w
.Convert(psz
,buf
);
// Wide -> multibyte; the size argument is unused, the input length comes
// from wxWcslen(psz).
2169 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2171 const size_t inbuf
= wxWcslen(psz
);
2173 w2m
.Convert(psz
,buf
);
2178 bool IsOk() const { return m_ok
; }
// the multibyte encoding handled and the converter for each direction
2181 wxFontEncoding m_enc
;
2182 wxEncodingConverter m2w
, w2m
;
2184 // were we initialized successfully?
2187 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2190 #endif // wxUSE_FONTMAP
2192 // ============================================================================
2193 // wxCSConv implementation
2194 // ============================================================================
2196 void wxCSConv::Init()
// Constructor taking a charset name; m_encoding is set to
// wxFONTENCODING_SYSTEM here.
2203 wxCSConv::wxCSConv(const wxChar
*charset
)
2212 m_encoding
= wxFONTENCODING_SYSTEM
;
// Constructor taking a font encoding. wxFONTENCODING_MAX and
// wxFONTENCODING_DEFAULT are not accepted: they trigger wxFAIL_MSG and are
// replaced by wxFONTENCODING_SYSTEM before the value is stored in m_encoding.
2215 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2217 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2219 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
// fall back to the system encoding
2221 encoding
= wxFONTENCODING_SYSTEM
;
2226 m_encoding
= encoding
;
2229 wxCSConv::~wxCSConv()
// Copy constructor: takes over the charset name (through SetName()) and the
// encoding of conv.
2234 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2239 SetName(conv
.m_name
);
2240 m_encoding
= conv
.m_encoding
;
// Assignment: takes over the charset name (through SetName()) and the
// encoding of conv.
2243 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2247 SetName(conv
.m_name
);
2248 m_encoding
= conv
.m_encoding
;
2253 void wxCSConv::Clear()
// Stores a private copy (made with wxStrdup()) of charset in m_name.
2262 void wxCSConv::SetName(const wxChar
*charset
)
2266 m_name
= wxStrdup(charset
);
// Creates the concrete wxMBConv object doing the work for m_name /
// m_encoding. Depending on the build configuration the candidates are
// wxMBConv_iconv, wxMBConv_win32, wxMBConv_mac, wxMBConv_cocoa, the built-in
// wxMBConvUTF* classes and finally wxMBConv_wxwin, tried in the order given
// in the comment below. If nothing fits an error is logged, with the
// alreadyLoggingError flag protecting against re-entering the logging code.
2271 wxMBConv
*wxCSConv::DoCreate() const
2273 // check for the special case of ASCII or ISO8859-1 charset: as we have
2274 // special knowledge of it anyhow, we don't need to create a special
2275 // conversion object
2276 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
2278 // don't convert at all
2282 // we trust OS to do conversion better than we can so try external
2283 // conversion methods first
2285 // the full order is:
2286 // 1. OS conversion (iconv() under Unix or Win32 API)
2287 // 2. hard coded conversions for UTF
2288 // 3. wxEncodingConverter as fall back
2294 #endif // !wxUSE_FONTMAP
// charset name handed to iconv; taken from m_name or from the font mapper
2296 wxString
name(m_name
);
2300 name
= wxFontMapper::Get()->GetEncodingName(m_encoding
);
2301 #endif // wxUSE_FONTMAP
2303 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
2309 #endif // HAVE_ICONV
2311 #ifdef wxHAVE_WIN32_MB2WC
// prefer the name when there is one, otherwise use the encoding
2314 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2315 : new wxMBConv_win32(m_encoding
);
2324 #endif // wxHAVE_WIN32_MB2WC
2325 #if defined(__WXMAC__)
// only for named charsets or encodings below wxFONTENCODING_UTF16BE
2327 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
) )
2330 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
2331 : new wxMBConv_mac(m_encoding
);
2339 #if defined(__WXCOCOA__)
// only for named charsets or encodings up to wxFONTENCODING_UTF16
2341 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
2344 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
2345 : new wxMBConv_cocoa(m_encoding
);
// encoding used to select one of the hard coded UTF converters
2354 wxFontEncoding enc
= m_encoding
;
2356 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2358 // use "false" to suppress interactive dialogs -- we can be called from
2359 // anywhere and popping up a dialog from here is the last thing we want to
2361 enc
= wxFontMapper::Get()->CharsetToEncoding(m_name
, false);
2363 #endif // wxUSE_FONTMAP
2367 case wxFONTENCODING_UTF7
:
2368 return new wxMBConvUTF7
;
2370 case wxFONTENCODING_UTF8
:
2371 return new wxMBConvUTF8
;
2373 case wxFONTENCODING_UTF16BE
:
2374 return new wxMBConvUTF16BE
;
2376 case wxFONTENCODING_UTF16LE
:
2377 return new wxMBConvUTF16LE
;
2379 case wxFONTENCODING_UTF32BE
:
2380 return new wxMBConvUTF32BE
;
2382 case wxFONTENCODING_UTF32LE
:
2383 return new wxMBConvUTF32LE
;
2386 // nothing to do but put here to suppress gcc warnings
// last resort: the wxEncodingConverter based class
2393 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2394 : new wxMBConv_wxwin(m_encoding
);
2400 #endif // wxUSE_FONTMAP
2402 // NB: This is a hack to prevent deadlock. What could otherwise happen
2403 // in Unicode build: wxConvLocal creation ends up being here
2404 // because of some failure and logs the error. But wxLog will try to
2405 // attach timestamp, for which it will need wxConvLocal (to convert
2406 // time to char* and then wchar_t*), but that fails, tries to log
2407 // error, but wxLog has a (already locked) critical section that
2408 // guards static buffer.
2409 static bool alreadyLoggingError
= false;
2410 if (!alreadyLoggingError
)
2412 alreadyLoggingError
= true;
2413 wxLogError(_("Cannot convert from the charset '%s'!"),
2417 wxFontMapper::GetEncodingDescription(m_encoding
).c_str()
2418 #else // !wxUSE_FONTMAP
// NOTE(review): m_encoding is compared against wxFONTENCODING_* values
// elsewhere in this function, yet it is formatted with %s here -- looks like
// an integer conversion was intended; confirm
2419 wxString::Format(_("encoding %s"), m_encoding
).c_str()
2420 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2422 alreadyLoggingError
= false;
// Performs the deferred creation of the real converter: fills in m_name with
// the system encoding name when neither a name nor a specific encoding was
// given, stores the result of DoCreate() in m_convReal and clears m_deferred.
// The method is const, so the members are modified through a cast-away-const
// pointer to this.
2428 void wxCSConv::CreateConvIfNeeded() const
2432 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
2435 // if we have neither the name nor the encoding, use the default
2436 // encoding for this system
2437 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
2439 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
2441 #endif // wxUSE_INTL
2443 self
->m_convReal
= DoCreate();
2444 self
->m_deferred
= false;
// Multibyte -> wide conversion. After making sure the real converter has
// been created, the call is forwarded to m_convReal->MB2WC(). The code after
// that return widens psz byte by byte instead (presumably the path taken
// when there is no real converter -- confirm against the guarding condition).
2448 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2450 CreateConvIfNeeded();
2453 return m_convReal
->MB2WC(buf
, psz
, n
);
2456 size_t len
= strlen(psz
);
// c <= len: the terminating NUL is copied as well; each byte is taken as an
// unsigned char before being widened
2460 for (size_t c
= 0; c
<= len
; c
++)
2461 buf
[c
] = (unsigned char)(psz
[c
]);
// Wide -> multibyte conversion. After making sure the real converter has
// been created, the call is forwarded to m_convReal->WC2MB(). The code after
// that return narrows psz character by character instead (presumably the
// path taken when there is no real converter -- confirm against the guarding
// condition).
2467 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2469 CreateConvIfNeeded();
2472 return m_convReal
->WC2MB(buf
, psz
, n
);
2475 const size_t len
= wxWcslen(psz
);
// c <= len: the terminating NUL is copied as well; each wide character is
// narrowed with a plain (char) cast
2478 for (size_t c
= 0; c
<= len
; c
++)
2482 buf
[c
] = (char)psz
[c
];
// second pass over the same index range
2487 for (size_t c
= 0; c
<= len
; c
++)
2497 // ----------------------------------------------------------------------------
2499 // ----------------------------------------------------------------------------
// The converter objects backing the public wxConv* globals. The type of
// wxConvLibcObj depends on the platform branch selected by the preprocessor.
2502 static wxMBConv_win32 wxConvLibcObj
;
2503 #elif defined(__WXMAC__) && !defined(__MACH__)
2504 static wxMBConv_mac wxConvLibcObj
;
2506 static wxMBConvLibc wxConvLibcObj
;
// converters for the system encoding, ISO 8859-1, UTF-7 and UTF-8
2509 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
2510 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
2511 static wxMBConvUTF7 wxConvUTF7Obj
;
2512 static wxMBConvUTF8 wxConvUTF8Obj
;
// exported references to the objects above; wxConvCurrent initially points
// at wxConvLibcObj
2515 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
2516 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
2517 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
2518 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
2519 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
2520 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
2522 #else // !wxUSE_WCHAR_T
2524 // stand-ins in absence of wchar_t
2525 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
2530 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T