1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
39 #include "wx/strconv.h"
44 #include "wx/msw/private.h"
48 #include "wx/msw/missing.h"
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
63 // ----------------------------------------------------------------------------
65 // ----------------------------------------------------------------------------
73 #include "wx/thread.h"
76 #include "wx/encconv.h"
77 #include "wx/fontmap.h"
81 #include <ATSUnicode.h>
82 #include <TextCommon.h>
83 #include <TextEncodingConverter.h>
85 #include "wx/mac/private.h" // includes mac headers
87 // ----------------------------------------------------------------------------
89 // ----------------------------------------------------------------------------
91 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
92 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
94 #if SIZEOF_WCHAR_T == 4
95 #define WC_NAME "UCS4"
96 #define WC_BSWAP BSWAP_UCS4
97 #ifdef WORDS_BIGENDIAN
98 #define WC_NAME_BEST "UCS-4BE"
100 #define WC_NAME_BEST "UCS-4LE"
102 #elif SIZEOF_WCHAR_T == 2
103 #define WC_NAME "UTF16"
104 #define WC_BSWAP BSWAP_UTF16
106 #ifdef WORDS_BIGENDIAN
107 #define WC_NAME_BEST "UTF-16BE"
109 #define WC_NAME_BEST "UTF-16LE"
111 #else // sizeof(wchar_t) != 2 nor 4
112 // does this ever happen?
113 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
116 // ============================================================================
118 // ============================================================================
120 // ----------------------------------------------------------------------------
121 // UTF-16 en/decoding to/from UCS-4
122 // ----------------------------------------------------------------------------
125 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
130 *output
= (wxUint16
) input
;
133 else if (input
>=0x110000)
141 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
142 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
148 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
150 if ((*input
<0xd800) || (*input
>0xdfff))
155 else if ((input
[1]<0xdc00) || (input
[1]>=0xdfff))
162 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
168 // ----------------------------------------------------------------------------
170 // ----------------------------------------------------------------------------
172 wxMBConv::~wxMBConv()
174 // nothing to do here (necessary for Darwin linking probably)
177 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
181 // calculate the length of the buffer needed first
182 size_t nLen
= MB2WC(NULL
, psz
, 0);
183 if ( nLen
!= (size_t)-1 )
185 // now do the actual conversion
186 wxWCharBuffer
buf(nLen
);
187 nLen
= MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NULL
188 if ( nLen
!= (size_t)-1 )
195 wxWCharBuffer
buf((wchar_t *)NULL
);
200 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
204 size_t nLen
= WC2MB(NULL
, pwz
, 0);
205 if ( nLen
!= (size_t)-1 )
207 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
208 nLen
= WC2MB(buf
.data(), pwz
, nLen
+ 4);
209 if ( nLen
!= (size_t)-1 )
216 wxCharBuffer
buf((char *)NULL
);
221 const wxWCharBuffer
wxMBConv::cMB2WC(const char *szString
, size_t nStringLen
, size_t* pOutSize
) const
223 wxASSERT(pOutSize
!= NULL
);
225 const char* szEnd
= szString
+ nStringLen
+ 1;
226 const char* szPos
= szString
;
227 const char* szStart
= szPos
;
229 size_t nActualLength
= 0;
230 size_t nCurrentSize
= nStringLen
; //try normal size first (should never resize?)
232 wxWCharBuffer
theBuffer(nCurrentSize
);
234 //Convert the string until the length() is reached, continuing the
235 //loop every time a null character is reached
236 while(szPos
!= szEnd
)
238 wxASSERT(szPos
< szEnd
); //something is _really_ screwed up if this rings true
240 //Get the length of the current (sub)string
241 size_t nLen
= MB2WC(NULL
, szPos
, 0);
243 //Invalid conversion?
244 if( nLen
== (size_t)-1 )
247 theBuffer
.data()[0u] = wxT('\0');
252 //Increase the actual length (+1 for current null character)
253 nActualLength
+= nLen
+ 1;
255 //if buffer too big, realloc the buffer
256 if (nActualLength
> (nCurrentSize
+1))
258 wxWCharBuffer
theNewBuffer(nCurrentSize
<< 1);
259 memcpy(theNewBuffer
.data(), theBuffer
.data(), nCurrentSize
* sizeof(wchar_t));
260 theBuffer
= theNewBuffer
;
264 //Convert the current (sub)string
265 if ( MB2WC(&theBuffer
.data()[szPos
- szStart
], szPos
, nLen
+ 1) == (size_t)-1 )
268 theBuffer
.data()[0u] = wxT('\0');
272 //Increment to next (sub)string
273 //Note that we have to use strlen here instead of nLen
274 //here because XX2XX gives us the size of the output buffer,
275 //not neccessarly the length of the string
276 szPos
+= strlen(szPos
) + 1;
279 //success - return actual length and the buffer
280 *pOutSize
= nActualLength
;
284 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *szString
, size_t nStringLen
, size_t* pOutSize
) const
286 wxASSERT(pOutSize
!= NULL
);
288 const wchar_t* szEnd
= szString
+ nStringLen
+ 1;
289 const wchar_t* szPos
= szString
;
290 const wchar_t* szStart
= szPos
;
292 size_t nActualLength
= 0;
293 size_t nCurrentSize
= nStringLen
<< 2; //try * 4 first
295 wxCharBuffer
theBuffer(nCurrentSize
);
297 //Convert the string until the length() is reached, continuing the
298 //loop every time a null character is reached
299 while(szPos
!= szEnd
)
301 wxASSERT(szPos
< szEnd
); //something is _really_ screwed up if this rings true
303 //Get the length of the current (sub)string
304 size_t nLen
= WC2MB(NULL
, szPos
, 0);
306 //Invalid conversion?
307 if( nLen
== (size_t)-1 )
310 theBuffer
.data()[0u] = wxT('\0');
314 //Increase the actual length (+1 for current null character)
315 nActualLength
+= nLen
+ 1;
317 //if buffer too big, realloc the buffer
318 if (nActualLength
> (nCurrentSize
+1))
320 wxCharBuffer
theNewBuffer(nCurrentSize
<< 1);
321 memcpy(theNewBuffer
.data(), theBuffer
.data(), nCurrentSize
);
322 theBuffer
= theNewBuffer
;
326 //Convert the current (sub)string
327 if(WC2MB(&theBuffer
.data()[szPos
- szStart
], szPos
, nLen
+ 1) == (size_t)-1 )
330 theBuffer
.data()[0u] = wxT('\0');
334 //Increment to next (sub)string
335 //Note that we have to use wxWcslen here instead of nLen
336 //here because XX2XX gives us the size of the output buffer,
337 //not neccessarly the length of the string
338 szPos
+= wxWcslen(szPos
) + 1;
341 //success - return actual length and the buffer
342 *pOutSize
= nActualLength
;
346 // ----------------------------------------------------------------------------
348 // ----------------------------------------------------------------------------
350 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
352 return wxMB2WC(buf
, psz
, n
);
355 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
357 return wxWC2MB(buf
, psz
, n
);
359 // ----------------------------------------------------------------------------
361 // ----------------------------------------------------------------------------
363 // Implementation (C) 2004 Fredrik Roubert
366 // BASE64 decoding table
368 static const unsigned char utf7unb64
[] =
370 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
371 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
372 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
373 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
374 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
375 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
376 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
377 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
378 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
379 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
380 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
381 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
382 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
383 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
384 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
385 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
386 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
387 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
388 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
389 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
390 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
391 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
392 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
393 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
394 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
395 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
396 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
397 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
398 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
399 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
400 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
401 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
404 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
409 while (*psz
&& ((!buf
) || (len
< n
)))
411 unsigned char cc
= *psz
++;
419 else if (*psz
== '-')
429 // BASE64 encoded string
433 for (lsb
= false, d
= 0, l
= 0;
434 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff; psz
++)
438 for (l
+= 6; l
>= 8; lsb
= !lsb
)
440 c
= (unsigned char)((d
>> (l
-= 8)) % 256);
449 *buf
= (wchar_t)(c
<< 8);
456 if (buf
&& (len
< n
))
462 // BASE64 encoding table
464 static const unsigned char utf7enb64
[] =
466 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
467 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
468 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
469 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
470 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
471 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
472 'w', 'x', 'y', 'z', '0', '1', '2', '3',
473 '4', '5', '6', '7', '8', '9', '+', '/'
477 // UTF-7 encoding table
479 // 0 - Set D (directly encoded characters)
480 // 1 - Set O (optional direct characters)
481 // 2 - whitespace characters (optional)
482 // 3 - special characters
484 static const unsigned char utf7encode
[128] =
486 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
487 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
488 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
489 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
490 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
492 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
496 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t
497 *psz
, size_t n
) const
503 while (*psz
&& ((!buf
) || (len
< n
)))
506 if (cc
< 0x80 && utf7encode
[cc
] < 1)
514 else if (((wxUint32
)cc
) > 0xffff)
516 // no surrogate pair generation (yet?)
527 // BASE64 encode string
528 unsigned int lsb
, d
, l
;
529 for (d
= 0, l
= 0;; psz
++)
531 for (lsb
= 0; lsb
< 2; lsb
++)
534 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
536 for (l
+= 8; l
>= 6; )
540 *buf
++ = utf7enb64
[(d
>> l
) % 64];
545 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
551 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
560 if (buf
&& (len
< n
))
565 // ----------------------------------------------------------------------------
567 // ----------------------------------------------------------------------------
569 static wxUint32 utf8_max
[]=
570 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
572 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
576 while (*psz
&& ((!buf
) || (len
< n
)))
578 unsigned char cc
= *psz
++, fc
= cc
;
580 for (cnt
= 0; fc
& 0x80; cnt
++)
594 // invalid UTF-8 sequence
599 unsigned ocnt
= cnt
- 1;
600 wxUint32 res
= cc
& (0x3f >> cnt
);
604 if ((cc
& 0xC0) != 0x80)
606 // invalid UTF-8 sequence
609 res
= (res
<< 6) | (cc
& 0x3f);
611 if (res
<= utf8_max
[ocnt
])
613 // illegal UTF-8 encoding
617 // cast is ok because wchar_t == wxUint16 if WC_UTF16
618 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
619 if (pa
== (size_t)-1)
628 #endif // WC_UTF16/!WC_UTF16
632 if (buf
&& (len
< n
))
637 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
641 while (*psz
&& ((!buf
) || (len
< n
)))
645 // cast is ok for WC_UTF16
646 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
647 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
649 cc
=(*psz
++) & 0x7fffffff;
652 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
666 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
668 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
673 if (buf
&& (len
<n
)) *buf
= 0;
681 // ----------------------------------------------------------------------------
683 // ----------------------------------------------------------------------------
685 #ifdef WORDS_BIGENDIAN
686 #define wxMBConvUTF16straight wxMBConvUTF16BE
687 #define wxMBConvUTF16swap wxMBConvUTF16LE
689 #define wxMBConvUTF16swap wxMBConvUTF16BE
690 #define wxMBConvUTF16straight wxMBConvUTF16LE
696 // copy 16bit MB to 16bit String
697 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
701 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
704 *buf
++ = *(wxUint16
*)psz
;
707 psz
+= sizeof(wxUint16
);
709 if (buf
&& len
<n
) *buf
=0;
715 // copy 16bit String to 16bit MB
716 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
720 while (*psz
&& (!buf
|| len
< n
))
724 *(wxUint16
*)buf
= *psz
;
725 buf
+= sizeof(wxUint16
);
727 len
+= sizeof(wxUint16
);
730 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
736 // swap 16bit MB to 16bit String
737 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
741 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
745 ((char *)buf
)[0] = psz
[1];
746 ((char *)buf
)[1] = psz
[0];
750 psz
+= sizeof(wxUint16
);
752 if (buf
&& len
<n
) *buf
=0;
758 // swap 16bit String to 16bit MB
759 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
763 while (*psz
&& (!buf
|| len
< n
))
767 *buf
++ = ((char*)psz
)[1];
768 *buf
++ = ((char*)psz
)[0];
770 len
+= sizeof(wxUint16
);
773 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
782 // copy 16bit MB to 32bit String
783 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
787 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
790 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
791 if (pa
== (size_t)-1)
797 psz
+= pa
* sizeof(wxUint16
);
799 if (buf
&& len
<n
) *buf
=0;
805 // copy 32bit String to 16bit MB
806 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
810 while (*psz
&& (!buf
|| len
< n
))
813 size_t pa
=encode_utf16(*psz
, cc
);
815 if (pa
== (size_t)-1)
820 *(wxUint16
*)buf
= cc
[0];
821 buf
+= sizeof(wxUint16
);
824 *(wxUint16
*)buf
= cc
[1];
825 buf
+= sizeof(wxUint16
);
829 len
+= pa
*sizeof(wxUint16
);
832 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
838 // swap 16bit MB to 32bit String
839 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
843 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
847 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
848 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
850 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
851 if (pa
== (size_t)-1)
858 psz
+= pa
* sizeof(wxUint16
);
860 if (buf
&& len
<n
) *buf
=0;
866 // swap 32bit String to 16bit MB
867 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
871 while (*psz
&& (!buf
|| len
< n
))
874 size_t pa
=encode_utf16(*psz
, cc
);
876 if (pa
== (size_t)-1)
881 *buf
++ = ((char*)cc
)[1];
882 *buf
++ = ((char*)cc
)[0];
885 *buf
++ = ((char*)cc
)[3];
886 *buf
++ = ((char*)cc
)[2];
890 len
+= pa
*sizeof(wxUint16
);
893 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
901 // ----------------------------------------------------------------------------
903 // ----------------------------------------------------------------------------
905 #ifdef WORDS_BIGENDIAN
906 #define wxMBConvUTF32straight wxMBConvUTF32BE
907 #define wxMBConvUTF32swap wxMBConvUTF32LE
909 #define wxMBConvUTF32swap wxMBConvUTF32BE
910 #define wxMBConvUTF32straight wxMBConvUTF32LE
914 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
915 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
920 // copy 32bit MB to 16bit String
921 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
925 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
929 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
930 if (pa
== (size_t)-1)
940 psz
+= sizeof(wxUint32
);
942 if (buf
&& len
<n
) *buf
=0;
948 // copy 16bit String to 32bit MB
949 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
953 while (*psz
&& (!buf
|| len
< n
))
957 // cast is ok for WC_UTF16
958 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
959 if (pa
== (size_t)-1)
964 *(wxUint32
*)buf
= cc
;
965 buf
+= sizeof(wxUint32
);
967 len
+= sizeof(wxUint32
);
971 if (buf
&& len
<=n
-sizeof(wxUint32
))
979 // swap 32bit MB to 16bit String
980 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
984 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
987 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
988 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
993 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
994 if (pa
== (size_t)-1)
1004 psz
+= sizeof(wxUint32
);
1014 // swap 16bit String to 32bit MB
1015 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1019 while (*psz
&& (!buf
|| len
< n
))
1023 // cast is ok for WC_UTF16
1024 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
1025 if (pa
== (size_t)-1)
1035 len
+= sizeof(wxUint32
);
1039 if (buf
&& len
<=n
-sizeof(wxUint32
))
1048 // copy 32bit MB to 32bit String
1049 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1053 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1056 *buf
++ = *(wxUint32
*)psz
;
1058 psz
+= sizeof(wxUint32
);
1068 // copy 32bit String to 32bit MB
1069 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1073 while (*psz
&& (!buf
|| len
< n
))
1077 *(wxUint32
*)buf
= *psz
;
1078 buf
+= sizeof(wxUint32
);
1081 len
+= sizeof(wxUint32
);
1085 if (buf
&& len
<=n
-sizeof(wxUint32
))
1092 // swap 32bit MB to 32bit String
1093 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1097 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1101 ((char *)buf
)[0] = psz
[3];
1102 ((char *)buf
)[1] = psz
[2];
1103 ((char *)buf
)[2] = psz
[1];
1104 ((char *)buf
)[3] = psz
[0];
1108 psz
+= sizeof(wxUint32
);
1118 // swap 32bit String to 32bit MB
1119 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1123 while (*psz
&& (!buf
|| len
< n
))
1127 *buf
++ = ((char *)psz
)[3];
1128 *buf
++ = ((char *)psz
)[2];
1129 *buf
++ = ((char *)psz
)[1];
1130 *buf
++ = ((char *)psz
)[0];
1132 len
+= sizeof(wxUint32
);
1136 if (buf
&& len
<=n
-sizeof(wxUint32
))
1146 // ============================================================================
1147 // The classes doing conversion using the iconv_xxx() functions
1148 // ============================================================================
1152 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1153 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1154 // (unless there's yet another bug in glibc) the only case when iconv()
1155 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1156 // left in the input buffer -- when _real_ error occurs,
1157 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1159 // [This bug does not appear in glibc 2.2.]
1160 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1161 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1162 (errno != E2BIG || bufLeft != 0))
1164 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1167 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1169 // ----------------------------------------------------------------------------
1170 // wxMBConv_iconv: encapsulates an iconv character set
1171 // ----------------------------------------------------------------------------
1173 class wxMBConv_iconv
: public wxMBConv
1176 wxMBConv_iconv(const wxChar
*name
);
1177 virtual ~wxMBConv_iconv();
1179 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1180 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1183 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
1186 // the iconv handlers used to translate from multibyte to wide char and in
1187 // the other direction
1191 // guards access to m2w and w2m objects
1192 wxMutex m_iconvMutex
;
1196 // the name (for iconv_open()) of a wide char charset -- if none is
1197 // available on this machine, it will remain NULL
1198 static const char *ms_wcCharsetName
;
1200 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1201 // different endian-ness than the native one
1202 static bool ms_wcNeedsSwap
;
1205 const char *wxMBConv_iconv::ms_wcCharsetName
= NULL
;
1206 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1208 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1210 // Do it the hard way
1212 for (size_t i
= 0; i
< wxStrlen(name
)+1; i
++)
1213 cname
[i
] = (char) name
[i
];
1215 // check for charset that represents wchar_t:
1216 if (ms_wcCharsetName
== NULL
)
1218 ms_wcNeedsSwap
= false;
1220 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1221 ms_wcCharsetName
= WC_NAME_BEST
;
1222 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1224 if (m2w
== (iconv_t
)-1)
1226 // try charset w/o bytesex info (e.g. "UCS4")
1227 // and check for bytesex ourselves:
1228 ms_wcCharsetName
= WC_NAME
;
1229 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1231 // last bet, try if it knows WCHAR_T pseudo-charset
1232 if (m2w
== (iconv_t
)-1)
1234 ms_wcCharsetName
= "WCHAR_T";
1235 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1238 if (m2w
!= (iconv_t
)-1)
1240 char buf
[2], *bufPtr
;
1241 wchar_t wbuf
[2], *wbufPtr
;
1249 outsz
= SIZEOF_WCHAR_T
* 2;
1253 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1254 (char**)&wbufPtr
, &outsz
);
1256 if (ICONV_FAILED(res
, insz
))
1258 ms_wcCharsetName
= NULL
;
1259 wxLogLastError(wxT("iconv"));
1260 wxLogError(_("Conversion to charset '%s' doesn't work."), name
);
1264 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1269 ms_wcCharsetName
= NULL
;
1271 // VS: we must not output an error here, since wxWidgets will safely
1272 // fall back to using wxEncodingConverter.
1273 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name
);
1277 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName
, ms_wcNeedsSwap
);
1279 else // we already have ms_wcCharsetName
1281 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1284 // NB: don't ever pass NULL to iconv_open(), it may crash!
1285 if ( ms_wcCharsetName
)
1287 w2m
= iconv_open( cname
, ms_wcCharsetName
);
1295 wxMBConv_iconv::~wxMBConv_iconv()
1297 if ( m2w
!= (iconv_t
)-1 )
1299 if ( w2m
!= (iconv_t
)-1 )
1303 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1306 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1307 // Unfortunately there are a couple of global wxCSConv objects such as
1308 // wxConvLocal that are used all over wx code, so we have to make sure
1309 // the handle is used by at most one thread at the time. Otherwise
1310 // only a few wx classes would be safe to use from non-main threads
1311 // as MB<->WC conversion would fail "randomly".
1312 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1315 size_t inbuf
= strlen(psz
);
1316 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1318 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1319 wchar_t *bufPtr
= buf
;
1320 const char *pszPtr
= psz
;
1324 // have destination buffer, convert there
1326 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1327 (char**)&bufPtr
, &outbuf
);
1328 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1332 // convert to native endianness
1333 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
1336 // NB: iconv was given only strlen(psz) characters on input, and so
1337 // it couldn't convert the trailing zero. Let's do it ourselves
1338 // if there's some room left for it in the output buffer.
1344 // no destination buffer... convert using temp buffer
1345 // to calculate destination buffer requirement
1350 outbuf
= 8*SIZEOF_WCHAR_T
;
1353 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1354 (char**)&bufPtr
, &outbuf
);
1356 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1357 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1360 if (ICONV_FAILED(cres
, inbuf
))
1362 //VS: it is ok if iconv fails, hence trace only
1363 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1370 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1373 // NB: explained in MB2WC
1374 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1377 size_t inbuf
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1381 wchar_t *tmpbuf
= 0;
1385 // need to copy to temp buffer to switch endianness
1386 // this absolutely doesn't rock!
1387 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1388 // could be in read-only memory, or be accessed in some other thread)
1389 tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
1390 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
1391 WC_BSWAP(tmpbuf
, inbuf
)
1397 // have destination buffer, convert there
1398 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1402 // NB: iconv was given only wcslen(psz) characters on input, and so
1403 // it couldn't convert the trailing zero. Let's do it ourselves
1404 // if there's some room left for it in the output buffer.
1410 // no destination buffer... convert using temp buffer
1411 // to calculate destination buffer requirement
1415 buf
= tbuf
; outbuf
= 16;
1417 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1420 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1428 if (ICONV_FAILED(cres
, inbuf
))
1430 //VS: it is ok if iconv fails, hence trace only
1431 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1438 #endif // HAVE_ICONV
1441 // ============================================================================
1442 // Win32 conversion classes
1443 // ============================================================================
1445 #ifdef wxHAVE_WIN32_MB2WC
1449 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1450 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1453 class wxMBConv_win32
: public wxMBConv
1458 m_CodePage
= CP_ACP
;
1462 wxMBConv_win32(const wxChar
* name
)
1464 m_CodePage
= wxCharsetToCodepage(name
);
1467 wxMBConv_win32(wxFontEncoding encoding
)
1469 m_CodePage
= wxEncodingToCodepage(encoding
);
1473 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1475 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
1476 // the behaviour is not compatible with the Unix version (using iconv)
1477 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
1478 // wouldn't work if reading an incomplete MB char didn't result in an
1480 const size_t len
= ::MultiByteToWideChar
1482 m_CodePage
, // code page
1483 MB_ERR_INVALID_CHARS
, // flags: fail on error
1484 psz
, // input string
1485 -1, // its length (NUL-terminated)
1486 buf
, // output string
1487 buf
? n
: 0 // size of output buffer
1490 // note that it returns count of written chars for buf != NULL and size
1491 // of the needed buffer for buf == NULL so in either case the length of
1492 // the string (which never includes the terminating NUL) is one less
1493 return len
? len
- 1 : (size_t)-1;
1496 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1499 we have a problem here: by default, WideCharToMultiByte() may
1500 replace characters unrepresentable in the target code page with bad
1501 quality approximations such as turning "1/2" symbol (U+00BD) into
1502 "1" for the code pages which don't have it and we, obviously, want
1503 to avoid this at any price
1505 the trouble is that this function does it _silently_, i.e. it won't
1506 even tell us whether it did or not... Win98/2000 and higher provide
1507 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1508 we have to resort to a round trip, i.e. check that converting back
1509 results in the same string -- this is, of course, expensive but
1510 otherwise we simply can't be sure to not garble the data.
1513 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1514 // it doesn't work with CJK encodings (which we test for rather roughly
1515 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1517 BOOL usedDef
wxDUMMY_INITIALIZE(false);
1520 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1522 // it's our lucky day
1523 flags
= WC_NO_BEST_FIT_CHARS
;
1524 pUsedDef
= &usedDef
;
1526 else // old system or unsupported encoding
1532 const size_t len
= ::WideCharToMultiByte
1534 m_CodePage
, // code page
1535 flags
, // either none or no best fit
1536 pwz
, // input string
1537 -1, // it is (wide) NUL-terminated
1538 buf
, // output buffer
1539 buf
? n
: 0, // and its size
1540 NULL
, // default "replacement" char
1541 pUsedDef
// [out] was it used?
1546 // function totally failed
1550 // if we were really converting, check if we succeeded
1555 // check if the conversion failed, i.e. if any replacements
1560 else // we must resort to double tripping...
1562 wxWCharBuffer
wcBuf(n
);
1563 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1564 wcscmp(wcBuf
, pwz
) != 0 )
1566 // we didn't obtain the same thing we started from, hence
1567 // the conversion was lossy and we consider that it failed
1573 // see the comment above for the reason of "len - 1"
1577 bool IsOk() const { return m_CodePage
!= -1; }
1580 static bool CanUseNoBestFit()
1582 static int s_isWin98Or2k
= -1;
1584 if ( s_isWin98Or2k
== -1 )
1587 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
1590 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
1594 s_isWin98Or2k
= verMaj
>= 5;
1598 // unknown, be conservative by default
1602 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1605 return s_isWin98Or2k
== 1;
1611 #endif // wxHAVE_WIN32_MB2WC
1613 // ============================================================================
1614 // Cocoa conversion classes
1615 // ============================================================================
1617 #if defined(__WXCOCOA__)
1619 // RN: There is no UTF-32 support in either Core Foundation or
1620 // Cocoa. Strangely enough, Core Foundation uses UTF-32
1621 // internally quite a bit - it's just not public (yet).
1623 #include <CoreFoundation/CFString.h>
1624 #include <CoreFoundation/CFStringEncodingExt.h>
1626 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
1628 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
1629 if ( encoding
== wxFONTENCODING_DEFAULT
)
1631 enc
= CFStringGetSystemEncoding();
1633 else switch( encoding
)
1635 case wxFONTENCODING_ISO8859_1
:
1636 enc
= kCFStringEncodingISOLatin1
;
1638 case wxFONTENCODING_ISO8859_2
:
1639 enc
= kCFStringEncodingISOLatin2
;
1641 case wxFONTENCODING_ISO8859_3
:
1642 enc
= kCFStringEncodingISOLatin3
;
1644 case wxFONTENCODING_ISO8859_4
:
1645 enc
= kCFStringEncodingISOLatin4
;
1647 case wxFONTENCODING_ISO8859_5
:
1648 enc
= kCFStringEncodingISOLatinCyrillic
;
1650 case wxFONTENCODING_ISO8859_6
:
1651 enc
= kCFStringEncodingISOLatinArabic
;
1653 case wxFONTENCODING_ISO8859_7
:
1654 enc
= kCFStringEncodingISOLatinGreek
;
1656 case wxFONTENCODING_ISO8859_8
:
1657 enc
= kCFStringEncodingISOLatinHebrew
;
1659 case wxFONTENCODING_ISO8859_9
:
1660 enc
= kCFStringEncodingISOLatin5
;
1662 case wxFONTENCODING_ISO8859_10
:
1663 enc
= kCFStringEncodingISOLatin6
;
1665 case wxFONTENCODING_ISO8859_11
:
1666 enc
= kCFStringEncodingISOLatinThai
;
1668 case wxFONTENCODING_ISO8859_13
:
1669 enc
= kCFStringEncodingISOLatin7
;
1671 case wxFONTENCODING_ISO8859_14
:
1672 enc
= kCFStringEncodingISOLatin8
;
1674 case wxFONTENCODING_ISO8859_15
:
1675 enc
= kCFStringEncodingISOLatin9
;
1678 case wxFONTENCODING_KOI8
:
1679 enc
= kCFStringEncodingKOI8_R
;
1681 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
1682 enc
= kCFStringEncodingDOSRussian
;
1685 // case wxFONTENCODING_BULGARIAN :
1689 case wxFONTENCODING_CP437
:
1690 enc
=kCFStringEncodingDOSLatinUS
;
1692 case wxFONTENCODING_CP850
:
1693 enc
= kCFStringEncodingDOSLatin1
;
1695 case wxFONTENCODING_CP852
:
1696 enc
= kCFStringEncodingDOSLatin2
;
1698 case wxFONTENCODING_CP855
:
1699 enc
= kCFStringEncodingDOSCyrillic
;
1701 case wxFONTENCODING_CP866
:
1702 enc
=kCFStringEncodingDOSRussian
;
1704 case wxFONTENCODING_CP874
:
1705 enc
= kCFStringEncodingDOSThai
;
1707 case wxFONTENCODING_CP932
:
1708 enc
= kCFStringEncodingDOSJapanese
;
1710 case wxFONTENCODING_CP936
:
1711 enc
=kCFStringEncodingDOSChineseSimplif
;
1713 case wxFONTENCODING_CP949
:
1714 enc
= kCFStringEncodingDOSKorean
;
1716 case wxFONTENCODING_CP950
:
1717 enc
= kCFStringEncodingDOSChineseTrad
;
1719 case wxFONTENCODING_CP1250
:
1720 enc
= kCFStringEncodingWindowsLatin2
;
1722 case wxFONTENCODING_CP1251
:
1723 enc
=kCFStringEncodingWindowsCyrillic
;
1725 case wxFONTENCODING_CP1252
:
1726 enc
=kCFStringEncodingWindowsLatin1
;
1728 case wxFONTENCODING_CP1253
:
1729 enc
= kCFStringEncodingWindowsGreek
;
1731 case wxFONTENCODING_CP1254
:
1732 enc
= kCFStringEncodingWindowsLatin5
;
1734 case wxFONTENCODING_CP1255
:
1735 enc
=kCFStringEncodingWindowsHebrew
;
1737 case wxFONTENCODING_CP1256
:
1738 enc
=kCFStringEncodingWindowsArabic
;
1740 case wxFONTENCODING_CP1257
:
1741 enc
= kCFStringEncodingWindowsBalticRim
;
1743 // This only really encodes to UTF7 (if that) evidently
1744 // case wxFONTENCODING_UTF7 :
1745 // enc = kCFStringEncodingNonLossyASCII ;
1747 case wxFONTENCODING_UTF8
:
1748 enc
= kCFStringEncodingUTF8
;
1750 case wxFONTENCODING_EUC_JP
:
1751 enc
= kCFStringEncodingEUC_JP
;
1753 case wxFONTENCODING_UTF16
:
1754 enc
= kCFStringEncodingUnicode
;
1756 case wxFONTENCODING_MACROMAN
:
1757 enc
= kCFStringEncodingMacRoman
;
1759 case wxFONTENCODING_MACJAPANESE
:
1760 enc
= kCFStringEncodingMacJapanese
;
1762 case wxFONTENCODING_MACCHINESETRAD
:
1763 enc
= kCFStringEncodingMacChineseTrad
;
1765 case wxFONTENCODING_MACKOREAN
:
1766 enc
= kCFStringEncodingMacKorean
;
1768 case wxFONTENCODING_MACARABIC
:
1769 enc
= kCFStringEncodingMacArabic
;
1771 case wxFONTENCODING_MACHEBREW
:
1772 enc
= kCFStringEncodingMacHebrew
;
1774 case wxFONTENCODING_MACGREEK
:
1775 enc
= kCFStringEncodingMacGreek
;
1777 case wxFONTENCODING_MACCYRILLIC
:
1778 enc
= kCFStringEncodingMacCyrillic
;
1780 case wxFONTENCODING_MACDEVANAGARI
:
1781 enc
= kCFStringEncodingMacDevanagari
;
1783 case wxFONTENCODING_MACGURMUKHI
:
1784 enc
= kCFStringEncodingMacGurmukhi
;
1786 case wxFONTENCODING_MACGUJARATI
:
1787 enc
= kCFStringEncodingMacGujarati
;
1789 case wxFONTENCODING_MACORIYA
:
1790 enc
= kCFStringEncodingMacOriya
;
1792 case wxFONTENCODING_MACBENGALI
:
1793 enc
= kCFStringEncodingMacBengali
;
1795 case wxFONTENCODING_MACTAMIL
:
1796 enc
= kCFStringEncodingMacTamil
;
1798 case wxFONTENCODING_MACTELUGU
:
1799 enc
= kCFStringEncodingMacTelugu
;
1801 case wxFONTENCODING_MACKANNADA
:
1802 enc
= kCFStringEncodingMacKannada
;
1804 case wxFONTENCODING_MACMALAJALAM
:
1805 enc
= kCFStringEncodingMacMalayalam
;
1807 case wxFONTENCODING_MACSINHALESE
:
1808 enc
= kCFStringEncodingMacSinhalese
;
1810 case wxFONTENCODING_MACBURMESE
:
1811 enc
= kCFStringEncodingMacBurmese
;
1813 case wxFONTENCODING_MACKHMER
:
1814 enc
= kCFStringEncodingMacKhmer
;
1816 case wxFONTENCODING_MACTHAI
:
1817 enc
= kCFStringEncodingMacThai
;
1819 case wxFONTENCODING_MACLAOTIAN
:
1820 enc
= kCFStringEncodingMacLaotian
;
1822 case wxFONTENCODING_MACGEORGIAN
:
1823 enc
= kCFStringEncodingMacGeorgian
;
1825 case wxFONTENCODING_MACARMENIAN
:
1826 enc
= kCFStringEncodingMacArmenian
;
1828 case wxFONTENCODING_MACCHINESESIMP
:
1829 enc
= kCFStringEncodingMacChineseSimp
;
1831 case wxFONTENCODING_MACTIBETAN
:
1832 enc
= kCFStringEncodingMacTibetan
;
1834 case wxFONTENCODING_MACMONGOLIAN
:
1835 enc
= kCFStringEncodingMacMongolian
;
1837 case wxFONTENCODING_MACETHIOPIC
:
1838 enc
= kCFStringEncodingMacEthiopic
;
1840 case wxFONTENCODING_MACCENTRALEUR
:
1841 enc
= kCFStringEncodingMacCentralEurRoman
;
1843 case wxFONTENCODING_MACVIATNAMESE
:
1844 enc
= kCFStringEncodingMacVietnamese
;
1846 case wxFONTENCODING_MACARABICEXT
:
1847 enc
= kCFStringEncodingMacExtArabic
;
1849 case wxFONTENCODING_MACSYMBOL
:
1850 enc
= kCFStringEncodingMacSymbol
;
1852 case wxFONTENCODING_MACDINGBATS
:
1853 enc
= kCFStringEncodingMacDingbats
;
1855 case wxFONTENCODING_MACTURKISH
:
1856 enc
= kCFStringEncodingMacTurkish
;
1858 case wxFONTENCODING_MACCROATIAN
:
1859 enc
= kCFStringEncodingMacCroatian
;
1861 case wxFONTENCODING_MACICELANDIC
:
1862 enc
= kCFStringEncodingMacIcelandic
;
1864 case wxFONTENCODING_MACROMANIAN
:
1865 enc
= kCFStringEncodingMacRomanian
;
1867 case wxFONTENCODING_MACCELTIC
:
1868 enc
= kCFStringEncodingMacCeltic
;
1870 case wxFONTENCODING_MACGAELIC
:
1871 enc
= kCFStringEncodingMacGaelic
;
1873 // case wxFONTENCODING_MACKEYBOARD :
1874 // enc = kCFStringEncodingMacKeyboardGlyphs ;
1877 // because gcc is picky
1883 class wxMBConv_cocoa
: public wxMBConv
1888 Init(CFStringGetSystemEncoding()) ;
1891 wxMBConv_cocoa(const wxChar
* name
)
1893 Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, false) ) ) ;
1896 wxMBConv_cocoa(wxFontEncoding encoding
)
1898 Init( wxCFStringEncFromFontEnc(encoding
) );
1905 void Init( CFStringEncoding encoding
)
1907 m_encoding
= encoding
;
1910 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
1914 CFStringRef theString
= CFStringCreateWithBytes (
1915 NULL
, //the allocator
1916 (const UInt8
*)szUnConv
,
1919 false //no BOM/external representation
1922 wxASSERT(theString
);
1924 size_t nOutLength
= CFStringGetLength(theString
);
1928 CFRelease(theString
);
1932 CFRange theRange
= { 0, nOutSize
};
1934 #if SIZEOF_WCHAR_T == 4
1935 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
1938 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
1940 CFRelease(theString
);
1942 szUniCharBuffer
[nOutLength
] = '\0' ;
1944 #if SIZEOF_WCHAR_T == 4
1945 wxMBConvUTF16 converter
;
1946 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
1947 delete[] szUniCharBuffer
;
1953 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
1957 size_t nRealOutSize
;
1958 size_t nBufSize
= wxWcslen(szUnConv
);
1959 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
1961 #if SIZEOF_WCHAR_T == 4
1962 wxMBConvUTF16BE converter
;
1963 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
1964 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
1965 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
1966 nBufSize
/= sizeof(UniChar
);
1969 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
1973 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
1976 wxASSERT(theString
);
1978 //Note that CER puts a BOM when converting to unicode
1979 //so we check and use getchars instead in that case
1980 if (m_encoding
== kCFStringEncodingUnicode
)
1983 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
1985 nRealOutSize
= CFStringGetLength(theString
) + 1;
1991 CFRangeMake(0, CFStringGetLength(theString
)),
1993 0, //what to put in characters that can't be converted -
1994 //0 tells CFString to return NULL if it meets such a character
1995 false, //not an external representation
1998 (CFIndex
*) &nRealOutSize
2002 CFRelease(theString
);
2004 #if SIZEOF_WCHAR_T == 4
2005 delete[] szUniBuffer
;
2008 return nRealOutSize
- 1;
2013 return m_encoding
!= kCFStringEncodingInvalidId
&&
2014 CFStringIsEncodingAvailable(m_encoding
);
2018 CFStringEncoding m_encoding
;
2021 #endif // defined(__WXCOCOA__)
2023 // ============================================================================
2024 // Mac conversion classes
2025 // ============================================================================
2027 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2029 class wxMBConv_mac
: public wxMBConv
2034 Init(CFStringGetSystemEncoding()) ;
2037 wxMBConv_mac(const wxChar
* name
)
2039 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, false) ) ) ;
2042 wxMBConv_mac(wxFontEncoding encoding
)
2044 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2049 OSStatus status
= noErr
;
2050 status
= TECDisposeConverter(m_MB2WC_converter
);
2051 status
= TECDisposeConverter(m_WC2MB_converter
);
2055 void Init( TextEncodingBase encoding
)
2057 OSStatus status
= noErr
;
2058 m_char_encoding
= encoding
;
2059 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2061 status
= TECCreateConverter(&m_MB2WC_converter
,
2063 m_unicode_encoding
);
2064 status
= TECCreateConverter(&m_WC2MB_converter
,
2069 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2071 OSStatus status
= noErr
;
2072 ByteCount byteOutLen
;
2073 ByteCount byteInLen
= strlen(psz
) ;
2074 wchar_t *tbuf
= NULL
;
2075 UniChar
* ubuf
= NULL
;
2080 //apple specs say at least 32
2081 n
= wxMax( 32 , byteInLen
) ;
2082 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2084 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2085 #if SIZEOF_WCHAR_T == 4
2086 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2088 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2090 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2091 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2092 #if SIZEOF_WCHAR_T == 4
2093 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2094 // is not properly terminated we get random characters at the end
2095 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2096 wxMBConvUTF16BE converter
;
2097 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
2100 res
= byteOutLen
/ sizeof( UniChar
) ;
2105 if ( buf
&& res
< n
)
2111 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2113 OSStatus status
= noErr
;
2114 ByteCount byteOutLen
;
2115 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2121 //apple specs say at least 32
2122 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2123 tbuf
= (char*) malloc( n
) ;
2126 ByteCount byteBufferLen
= n
;
2127 UniChar
* ubuf
= NULL
;
2128 #if SIZEOF_WCHAR_T == 4
2129 wxMBConvUTF16BE converter
;
2130 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2131 byteInLen
= unicharlen
;
2132 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2133 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2135 ubuf
= (UniChar
*) psz
;
2137 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2138 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2139 #if SIZEOF_WCHAR_T == 4
2145 size_t res
= byteOutLen
;
2146 if ( buf
&& res
< n
)
2150 //we need to double-trip to verify it didn't insert any ? in place
2151 //of bogus characters
2152 wxWCharBuffer
wcBuf(n
);
2153 size_t pszlen
= wxWcslen(psz
);
2154 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2155 wxWcslen(wcBuf
) != pszlen
||
2156 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2158 // we didn't obtain the same thing we started from, hence
2159 // the conversion was lossy and we consider that it failed
2168 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2171 TECObjectRef m_MB2WC_converter
;
2172 TECObjectRef m_WC2MB_converter
;
2174 TextEncodingBase m_char_encoding
;
2175 TextEncodingBase m_unicode_encoding
;
2178 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2180 // ============================================================================
2181 // wxEncodingConverter based conversion classes
2182 // ============================================================================
2186 class wxMBConv_wxwin
: public wxMBConv
2191 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2192 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2196 // temporarily just use wxEncodingConverter stuff,
2197 // so that it works while a better implementation is built
2198 wxMBConv_wxwin(const wxChar
* name
)
2201 m_enc
= wxFontMapper::Get()->CharsetToEncoding(name
, false);
2203 m_enc
= wxFONTENCODING_SYSTEM
;
2208 wxMBConv_wxwin(wxFontEncoding enc
)
2215 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2217 size_t inbuf
= strlen(psz
);
2220 if (!m2w
.Convert(psz
,buf
))
2226 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2228 const size_t inbuf
= wxWcslen(psz
);
2231 if (!w2m
.Convert(psz
,buf
))
2238 bool IsOk() const { return m_ok
; }
2241 wxFontEncoding m_enc
;
2242 wxEncodingConverter m2w
, w2m
;
2244 // were we initialized successfully?
2247 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2250 #endif // wxUSE_FONTMAP
2252 // ============================================================================
2253 // wxCSConv implementation
2254 // ============================================================================
2256 void wxCSConv::Init()
2263 wxCSConv::wxCSConv(const wxChar
*charset
)
2272 m_encoding
= wxFONTENCODING_SYSTEM
;
2275 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2277 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2279 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2281 encoding
= wxFONTENCODING_SYSTEM
;
2286 m_encoding
= encoding
;
2289 wxCSConv::~wxCSConv()
2294 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2299 SetName(conv
.m_name
);
2300 m_encoding
= conv
.m_encoding
;
2303 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2307 SetName(conv
.m_name
);
2308 m_encoding
= conv
.m_encoding
;
2313 void wxCSConv::Clear()
2322 void wxCSConv::SetName(const wxChar
*charset
)
2326 m_name
= wxStrdup(charset
);
2331 wxMBConv
*wxCSConv::DoCreate() const
2333 // check for the special case of ASCII or ISO8859-1 charset: as we have
2334 // special knowledge of it anyhow, we don't need to create a special
2335 // conversion object
2336 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
2338 // don't convert at all
2342 // we trust OS to do conversion better than we can so try external
2343 // conversion methods first
2345 // the full order is:
2346 // 1. OS conversion (iconv() under Unix or Win32 API)
2347 // 2. hard coded conversions for UTF
2348 // 3. wxEncodingConverter as fall back
2354 #endif // !wxUSE_FONTMAP
2356 wxString
name(m_name
);
2360 name
= wxFontMapper::Get()->GetEncodingName(m_encoding
);
2361 #endif // wxUSE_FONTMAP
2363 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
2369 #endif // HAVE_ICONV
2371 #ifdef wxHAVE_WIN32_MB2WC
2374 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2375 : new wxMBConv_win32(m_encoding
);
2384 #endif // wxHAVE_WIN32_MB2WC
2385 #if defined(__WXMAC__)
2387 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
) )
2390 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
2391 : new wxMBConv_mac(m_encoding
);
2399 #if defined(__WXCOCOA__)
2401 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
2404 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
2405 : new wxMBConv_cocoa(m_encoding
);
2414 wxFontEncoding enc
= m_encoding
;
2416 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2418 // use "false" to suppress interactive dialogs -- we can be called from
2419 // anywhere and popping up a dialog from here is the last thing we want to
2421 enc
= wxFontMapper::Get()->CharsetToEncoding(m_name
, false);
2423 #endif // wxUSE_FONTMAP
2427 case wxFONTENCODING_UTF7
:
2428 return new wxMBConvUTF7
;
2430 case wxFONTENCODING_UTF8
:
2431 return new wxMBConvUTF8
;
2433 case wxFONTENCODING_UTF16BE
:
2434 return new wxMBConvUTF16BE
;
2436 case wxFONTENCODING_UTF16LE
:
2437 return new wxMBConvUTF16LE
;
2439 case wxFONTENCODING_UTF32BE
:
2440 return new wxMBConvUTF32BE
;
2442 case wxFONTENCODING_UTF32LE
:
2443 return new wxMBConvUTF32LE
;
2446 // nothing to do but put here to suppress gcc warnings
2453 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2454 : new wxMBConv_wxwin(m_encoding
);
2460 #endif // wxUSE_FONTMAP
2462 // NB: This is a hack to prevent deadlock. What could otherwise happen
2463 // in a Unicode build: wxConvLocal creation ends up here
2464 // because of some failure and logs the error. But wxLog will try to
2465 // attach a timestamp, for which it will need wxConvLocal (to convert
2466 // time to char* and then wchar_t*), but that fails and tries to log
2467 // an error, while wxLog has an (already locked) critical section that
2468 // guards a static buffer.
2469 static bool alreadyLoggingError
= false;
2470 if (!alreadyLoggingError
)
2472 alreadyLoggingError
= true;
2473 wxLogError(_("Cannot convert from the charset '%s'!"),
2477 wxFontMapper::GetEncodingDescription(m_encoding
).c_str()
2478 #else // !wxUSE_FONTMAP
2479 wxString::Format(_("encoding %s"), m_encoding
).c_str()
2480 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2482 alreadyLoggingError
= false;
2488 void wxCSConv::CreateConvIfNeeded() const
2492 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
2495 // if we have neither the name nor the encoding, use the default
2496 // encoding for this system
2497 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
2499 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
2501 #endif // wxUSE_INTL
2503 self
->m_convReal
= DoCreate();
2504 self
->m_deferred
= false;
2508 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2510 CreateConvIfNeeded();
2513 return m_convReal
->MB2WC(buf
, psz
, n
);
2516 size_t len
= strlen(psz
);
2520 for (size_t c
= 0; c
<= len
; c
++)
2521 buf
[c
] = (unsigned char)(psz
[c
]);
2527 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2529 CreateConvIfNeeded();
2532 return m_convReal
->WC2MB(buf
, psz
, n
);
2535 const size_t len
= wxWcslen(psz
);
2538 for (size_t c
= 0; c
<= len
; c
++)
2542 buf
[c
] = (char)psz
[c
];
2547 for (size_t c
= 0; c
<= len
; c
++)
2557 // ----------------------------------------------------------------------------
2559 // ----------------------------------------------------------------------------
2562 static wxMBConv_win32 wxConvLibcObj
;
2563 #elif defined(__WXMAC__) && !defined(__MACH__)
2564 static wxMBConv_mac wxConvLibcObj
;
2566 static wxMBConvLibc wxConvLibcObj
;
2569 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
2570 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
2571 static wxMBConvUTF7 wxConvUTF7Obj
;
2572 static wxMBConvUTF8 wxConvUTF8Obj
;
2575 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
2576 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
2577 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
2578 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
2579 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
2580 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
2582 #else // !wxUSE_WCHAR_T
2584 // stand-ins in absence of wchar_t
2585 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
2590 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T