1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
39 #include "wx/strconv.h"
44 #include "wx/msw/private.h"
48 #include "wx/msw/missing.h"
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
63 // ----------------------------------------------------------------------------
65 // ----------------------------------------------------------------------------
73 #include "wx/thread.h"
76 #include "wx/encconv.h"
77 #include "wx/fontmap.h"
81 #include <ATSUnicode.h>
82 #include <TextCommon.h>
83 #include <TextEncodingConverter.h>
85 #include "wx/mac/private.h" // includes mac headers
87 // ----------------------------------------------------------------------------
89 // ----------------------------------------------------------------------------
91 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
92 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
94 #if SIZEOF_WCHAR_T == 4
95 #define WC_NAME "UCS4"
96 #define WC_BSWAP BSWAP_UCS4
97 #ifdef WORDS_BIGENDIAN
98 #define WC_NAME_BEST "UCS-4BE"
100 #define WC_NAME_BEST "UCS-4LE"
102 #elif SIZEOF_WCHAR_T == 2
103 #define WC_NAME "UTF16"
104 #define WC_BSWAP BSWAP_UTF16
106 #ifdef WORDS_BIGENDIAN
107 #define WC_NAME_BEST "UTF-16BE"
109 #define WC_NAME_BEST "UTF-16LE"
111 #else // sizeof(wchar_t) != 2 nor 4
112 // does this ever happen?
113 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
116 // ============================================================================
118 // ============================================================================
120 // ----------------------------------------------------------------------------
121 // UTF-16 en/decoding to/from UCS-4
122 // ----------------------------------------------------------------------------
125 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
130 *output
= (wxUint16
) input
;
133 else if (input
>=0x110000)
141 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
142 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
148 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
150 if ((*input
<0xd800) || (*input
>0xdfff))
155 else if ((input
[1]<0xdc00) || (input
[1]>=0xdfff))
162 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
168 // ----------------------------------------------------------------------------
170 // ----------------------------------------------------------------------------
172 wxMBConv::~wxMBConv()
174 // nothing to do here (necessary for Darwin linking probably)
177 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
181 // calculate the length of the buffer needed first
182 size_t nLen
= MB2WC(NULL
, psz
, 0);
183 if ( nLen
!= (size_t)-1 )
185 // now do the actual conversion
186 wxWCharBuffer
buf(nLen
);
187 nLen
= MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NULL
188 if ( nLen
!= (size_t)-1 )
195 wxWCharBuffer
buf((wchar_t *)NULL
);
200 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
204 size_t nLen
= WC2MB(NULL
, pwz
, 0);
205 if ( nLen
!= (size_t)-1 )
207 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
208 nLen
= WC2MB(buf
.data(), pwz
, nLen
+ 4);
209 if ( nLen
!= (size_t)-1 )
216 wxCharBuffer
buf((char *)NULL
);
221 const wxWCharBuffer
wxMBConv::cMB2WC(const char *szString
, size_t nStringLen
, size_t* pOutSize
) const
223 wxASSERT(pOutSize
!= NULL
);
225 const char* szEnd
= szString
+ nStringLen
+ 1;
226 const char* szPos
= szString
;
227 const char* szStart
= szPos
;
229 size_t nActualLength
= 0;
230 size_t nCurrentSize
= nStringLen
; //try normal size first (should never resize?)
232 wxWCharBuffer
theBuffer(nCurrentSize
);
234 //Convert the string until the length() is reached, continuing the
235 //loop every time a null character is reached
236 while(szPos
!= szEnd
)
238 wxASSERT(szPos
< szEnd
); //something is _really_ screwed up if this rings true
240 //Get the length of the current (sub)string
241 size_t nLen
= MB2WC(NULL
, szPos
, 0);
243 //Invalid conversion?
244 if( nLen
== (size_t)-1 )
247 theBuffer
.data()[0u] = wxT('\0');
252 //Increase the actual length (+1 for current null character)
253 nActualLength
+= nLen
+ 1;
255 //if the buffer is too small for the data so far, allocate a bigger one
256 if (nActualLength
> (nCurrentSize
+1))
258 wxWCharBuffer
theNewBuffer(nCurrentSize
<< 1);
259 memcpy(theNewBuffer
.data(), theBuffer
.data(), nCurrentSize
* sizeof(wchar_t));
260 theBuffer
= theNewBuffer
;
264 //Convert the current (sub)string
265 if ( MB2WC(&theBuffer
.data()[szPos
- szStart
], szPos
, nLen
+ 1) == (size_t)-1 )
268 theBuffer
.data()[0u] = wxT('\0');
272 //Increment to next (sub)string
273 //Note that we have to use strlen here instead of nLen
274 //because XX2XX gives us the size of the output buffer,
275 //not necessarily the length of the string
276 szPos
+= strlen(szPos
) + 1;
279 //success - return actual length and the buffer
280 *pOutSize
= nActualLength
;
284 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *szString
, size_t nStringLen
, size_t* pOutSize
) const
286 wxASSERT(pOutSize
!= NULL
);
288 const wchar_t* szEnd
= szString
+ nStringLen
+ 1;
289 const wchar_t* szPos
= szString
;
290 const wchar_t* szStart
= szPos
;
292 size_t nActualLength
= 0;
293 size_t nCurrentSize
= nStringLen
<< 2; //try * 4 first
295 wxCharBuffer
theBuffer(nCurrentSize
);
297 //Convert the string until the length() is reached, continuing the
298 //loop every time a null character is reached
299 while(szPos
!= szEnd
)
301 wxASSERT(szPos
< szEnd
); //something is _really_ screwed up if this rings true
303 //Get the length of the current (sub)string
304 size_t nLen
= WC2MB(NULL
, szPos
, 0);
306 //Invalid conversion?
307 if( nLen
== (size_t)-1 )
310 theBuffer
.data()[0u] = wxT('\0');
314 //Increase the actual length (+1 for current null character)
315 nActualLength
+= nLen
+ 1;
317 //if the buffer is too small for the data so far, allocate a bigger one
318 if (nActualLength
> (nCurrentSize
+1))
320 wxCharBuffer
theNewBuffer(nCurrentSize
<< 1);
321 memcpy(theNewBuffer
.data(), theBuffer
.data(), nCurrentSize
);
322 theBuffer
= theNewBuffer
;
326 //Convert the current (sub)string
327 if(WC2MB(&theBuffer
.data()[szPos
- szStart
], szPos
, nLen
+ 1) == (size_t)-1 )
330 theBuffer
.data()[0u] = wxT('\0');
334 //Increment to next (sub)string
335 //Note that we have to use wxWcslen here instead of nLen
336 //because XX2XX gives us the size of the output buffer,
337 //not necessarily the length of the string
338 szPos
+= wxWcslen(szPos
) + 1;
341 //success - return actual length and the buffer
342 *pOutSize
= nActualLength
;
346 // ----------------------------------------------------------------------------
348 // ----------------------------------------------------------------------------
350 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
352 return wxMB2WC(buf
, psz
, n
);
355 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
357 return wxWC2MB(buf
, psz
, n
);
360 // ----------------------------------------------------------------------------
361 // wxConvBrokenFileNames is made for GTK2 in Unicode mode when
362 // files are accidentally written in an encoding which is not
363 // the system encoding. Typically, the system encoding will be
364 // UTF8 but there might be files stored in ISO8859-1 on disk.
365 // ----------------------------------------------------------------------------
367 class wxConvBrokenFileNames
: public wxMBConvLibc
370 virtual size_t MB2WC(wchar_t *outputBuf
, const char *psz
, size_t outputSize
) const;
371 virtual size_t WC2MB(char *outputBuf
, const wchar_t *psz
, size_t outputSize
) const;
374 size_t wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf
, const char *psz
, size_t outputSize
) const
377 if (we find some invalid characters
)
379 Convert to Unicode range
.
383 return wxMBConvLibc::MB2WC( outputBuf
, psz
, outputSize
);
386 size_t wxConvBrokenFileNames::WC2MB(char *outputBuf
, const wchar_t *psz
, size_t outputSize
) const
389 Convert back from Unicode range
.
391 return wxMBConvLibc::WC2MB( outputBuf
, psz
, outputSize
);
394 // ----------------------------------------------------------------------------
396 // ----------------------------------------------------------------------------
398 // Implementation (C) 2004 Fredrik Roubert
401 // BASE64 decoding table
403 static const unsigned char utf7unb64
[] =
405 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
406 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
407 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
408 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
409 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
410 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
411 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
412 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
413 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
414 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
415 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
416 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
417 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
418 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
419 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
420 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
421 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
422 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
423 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
424 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
425 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
426 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
427 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
428 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
429 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
430 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
431 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
432 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
433 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
434 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
435 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
436 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
439 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
443 while (*psz
&& ((!buf
) || (len
< n
)))
445 unsigned char cc
= *psz
++;
453 else if (*psz
== '-')
463 // BASE64 encoded string
467 for (lsb
= false, d
= 0, l
= 0;
468 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff; psz
++)
472 for (l
+= 6; l
>= 8; lsb
= !lsb
)
474 c
= (unsigned char)((d
>> (l
-= 8)) % 256);
483 *buf
= (wchar_t)(c
<< 8);
490 if (buf
&& (len
< n
))
496 // BASE64 encoding table
498 static const unsigned char utf7enb64
[] =
500 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
501 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
502 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
503 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
504 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
505 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
506 'w', 'x', 'y', 'z', '0', '1', '2', '3',
507 '4', '5', '6', '7', '8', '9', '+', '/'
511 // UTF-7 encoding table
513 // 0 - Set D (directly encoded characters)
514 // 1 - Set O (optional direct characters)
515 // 2 - whitespace characters (optional)
516 // 3 - special characters
518 static const unsigned char utf7encode
[128] =
520 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
521 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
522 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
523 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
524 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
525 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
526 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
527 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
530 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
536 while (*psz
&& ((!buf
) || (len
< n
)))
539 if (cc
< 0x80 && utf7encode
[cc
] < 1)
547 else if (((wxUint32
)cc
) > 0xffff)
549 // no surrogate pair generation (yet?)
560 // BASE64 encode string
561 unsigned int lsb
, d
, l
;
562 for (d
= 0, l
= 0;; psz
++)
564 for (lsb
= 0; lsb
< 2; lsb
++)
567 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
569 for (l
+= 8; l
>= 6; )
573 *buf
++ = utf7enb64
[(d
>> l
) % 64];
578 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
584 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
593 if (buf
&& (len
< n
))
598 // ----------------------------------------------------------------------------
600 // ----------------------------------------------------------------------------
602 static wxUint32 utf8_max
[]=
603 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
605 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
609 while (*psz
&& ((!buf
) || (len
< n
)))
611 unsigned char cc
= *psz
++, fc
= cc
;
613 for (cnt
= 0; fc
& 0x80; cnt
++)
627 // invalid UTF-8 sequence
632 unsigned ocnt
= cnt
- 1;
633 wxUint32 res
= cc
& (0x3f >> cnt
);
637 if ((cc
& 0xC0) != 0x80)
639 // invalid UTF-8 sequence
642 res
= (res
<< 6) | (cc
& 0x3f);
644 if (res
<= utf8_max
[ocnt
])
646 // illegal UTF-8 encoding
650 // cast is ok because wchar_t == wxUint16 if WC_UTF16
651 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
652 if (pa
== (size_t)-1)
661 #endif // WC_UTF16/!WC_UTF16
665 if (buf
&& (len
< n
))
670 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
674 while (*psz
&& ((!buf
) || (len
< n
)))
678 // cast is ok for WC_UTF16
679 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
680 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
682 cc
=(*psz
++) & 0x7fffffff;
685 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
699 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
701 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
706 if (buf
&& (len
<n
)) *buf
= 0;
714 // ----------------------------------------------------------------------------
716 // ----------------------------------------------------------------------------
718 #ifdef WORDS_BIGENDIAN
719 #define wxMBConvUTF16straight wxMBConvUTF16BE
720 #define wxMBConvUTF16swap wxMBConvUTF16LE
722 #define wxMBConvUTF16swap wxMBConvUTF16BE
723 #define wxMBConvUTF16straight wxMBConvUTF16LE
729 // copy 16bit MB to 16bit String
730 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
734 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
737 *buf
++ = *(wxUint16
*)psz
;
740 psz
+= sizeof(wxUint16
);
742 if (buf
&& len
<n
) *buf
=0;
748 // copy 16bit String to 16bit MB
749 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
753 while (*psz
&& (!buf
|| len
< n
))
757 *(wxUint16
*)buf
= *psz
;
758 buf
+= sizeof(wxUint16
);
760 len
+= sizeof(wxUint16
);
763 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
769 // swap 16bit MB to 16bit String
770 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
774 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
778 ((char *)buf
)[0] = psz
[1];
779 ((char *)buf
)[1] = psz
[0];
783 psz
+= sizeof(wxUint16
);
785 if (buf
&& len
<n
) *buf
=0;
791 // swap 16bit String to 16bit MB
792 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
796 while (*psz
&& (!buf
|| len
< n
))
800 *buf
++ = ((char*)psz
)[1];
801 *buf
++ = ((char*)psz
)[0];
803 len
+= sizeof(wxUint16
);
806 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
815 // copy 16bit MB to 32bit String
816 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
820 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
823 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
824 if (pa
== (size_t)-1)
830 psz
+= pa
* sizeof(wxUint16
);
832 if (buf
&& len
<n
) *buf
=0;
838 // copy 32bit String to 16bit MB
839 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
843 while (*psz
&& (!buf
|| len
< n
))
846 size_t pa
=encode_utf16(*psz
, cc
);
848 if (pa
== (size_t)-1)
853 *(wxUint16
*)buf
= cc
[0];
854 buf
+= sizeof(wxUint16
);
857 *(wxUint16
*)buf
= cc
[1];
858 buf
+= sizeof(wxUint16
);
862 len
+= pa
*sizeof(wxUint16
);
865 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
871 // swap 16bit MB to 32bit String
872 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
876 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
880 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
881 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
883 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
884 if (pa
== (size_t)-1)
891 psz
+= pa
* sizeof(wxUint16
);
893 if (buf
&& len
<n
) *buf
=0;
899 // swap 32bit String to 16bit MB
900 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
904 while (*psz
&& (!buf
|| len
< n
))
907 size_t pa
=encode_utf16(*psz
, cc
);
909 if (pa
== (size_t)-1)
914 *buf
++ = ((char*)cc
)[1];
915 *buf
++ = ((char*)cc
)[0];
918 *buf
++ = ((char*)cc
)[3];
919 *buf
++ = ((char*)cc
)[2];
923 len
+= pa
*sizeof(wxUint16
);
926 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
934 // ----------------------------------------------------------------------------
936 // ----------------------------------------------------------------------------
938 #ifdef WORDS_BIGENDIAN
939 #define wxMBConvUTF32straight wxMBConvUTF32BE
940 #define wxMBConvUTF32swap wxMBConvUTF32LE
942 #define wxMBConvUTF32swap wxMBConvUTF32BE
943 #define wxMBConvUTF32straight wxMBConvUTF32LE
947 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
948 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
953 // copy 32bit MB to 16bit String
954 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
958 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
962 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
963 if (pa
== (size_t)-1)
973 psz
+= sizeof(wxUint32
);
975 if (buf
&& len
<n
) *buf
=0;
981 // copy 16bit String to 32bit MB
982 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
986 while (*psz
&& (!buf
|| len
< n
))
990 // cast is ok for WC_UTF16
991 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
992 if (pa
== (size_t)-1)
997 *(wxUint32
*)buf
= cc
;
998 buf
+= sizeof(wxUint32
);
1000 len
+= sizeof(wxUint32
);
1004 if (buf
&& len
<=n
-sizeof(wxUint32
))
1012 // swap 32bit MB to 16bit String
1013 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1017 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1020 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
1021 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
1026 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
1027 if (pa
== (size_t)-1)
1037 psz
+= sizeof(wxUint32
);
1047 // swap 16bit String to 32bit MB
1048 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1052 while (*psz
&& (!buf
|| len
< n
))
1056 // cast is ok for WC_UTF16
1057 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
1058 if (pa
== (size_t)-1)
1068 len
+= sizeof(wxUint32
);
1072 if (buf
&& len
<=n
-sizeof(wxUint32
))
1081 // copy 32bit MB to 32bit String
1082 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1086 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1089 *buf
++ = *(wxUint32
*)psz
;
1091 psz
+= sizeof(wxUint32
);
1101 // copy 32bit String to 32bit MB
1102 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1106 while (*psz
&& (!buf
|| len
< n
))
1110 *(wxUint32
*)buf
= *psz
;
1111 buf
+= sizeof(wxUint32
);
1114 len
+= sizeof(wxUint32
);
1118 if (buf
&& len
<=n
-sizeof(wxUint32
))
1125 // swap 32bit MB to 32bit String
1126 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1130 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1134 ((char *)buf
)[0] = psz
[3];
1135 ((char *)buf
)[1] = psz
[2];
1136 ((char *)buf
)[2] = psz
[1];
1137 ((char *)buf
)[3] = psz
[0];
1141 psz
+= sizeof(wxUint32
);
1151 // swap 32bit String to 32bit MB
1152 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1156 while (*psz
&& (!buf
|| len
< n
))
1160 *buf
++ = ((char *)psz
)[3];
1161 *buf
++ = ((char *)psz
)[2];
1162 *buf
++ = ((char *)psz
)[1];
1163 *buf
++ = ((char *)psz
)[0];
1165 len
+= sizeof(wxUint32
);
1169 if (buf
&& len
<=n
-sizeof(wxUint32
))
1179 // ============================================================================
1180 // The classes doing conversion using the iconv_xxx() functions
1181 // ============================================================================
1185 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1186 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1187 // (unless there's yet another bug in glibc) the only case when iconv()
1188 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1189 // left in the input buffer -- when _real_ error occurs,
1190 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1192 // [This bug does not appear in glibc 2.2.]
1193 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1194 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1195 (errno != E2BIG || bufLeft != 0))
1197 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1200 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1202 // ----------------------------------------------------------------------------
1203 // wxMBConv_iconv: encapsulates an iconv character set
1204 // ----------------------------------------------------------------------------
1206 class wxMBConv_iconv
: public wxMBConv
1209 wxMBConv_iconv(const wxChar
*name
);
1210 virtual ~wxMBConv_iconv();
1212 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1213 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1216 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
1219 // the iconv handlers used to translate from multibyte to wide char and in
1220 // the other direction
1224 // guards access to m2w and w2m objects
1225 wxMutex m_iconvMutex
;
1229 // the name (for iconv_open()) of a wide char charset -- if none is
1230 // available on this machine, it will remain NULL
1231 static const char *ms_wcCharsetName
;
1233 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1234 // different endian-ness than the native one
1235 static bool ms_wcNeedsSwap
;
1238 const char *wxMBConv_iconv::ms_wcCharsetName
= NULL
;
1239 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1241 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1243 // Do it the hard way
1245 for (size_t i
= 0; i
< wxStrlen(name
)+1; i
++)
1246 cname
[i
] = (char) name
[i
];
1248 // check for charset that represents wchar_t:
1249 if (ms_wcCharsetName
== NULL
)
1251 ms_wcNeedsSwap
= false;
1253 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1254 ms_wcCharsetName
= WC_NAME_BEST
;
1255 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1257 if (m2w
== (iconv_t
)-1)
1259 // try charset w/o bytesex info (e.g. "UCS4")
1260 // and check for bytesex ourselves:
1261 ms_wcCharsetName
= WC_NAME
;
1262 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1264 // last bet, try if it knows WCHAR_T pseudo-charset
1265 if (m2w
== (iconv_t
)-1)
1267 ms_wcCharsetName
= "WCHAR_T";
1268 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1271 if (m2w
!= (iconv_t
)-1)
1273 char buf
[2], *bufPtr
;
1274 wchar_t wbuf
[2], *wbufPtr
;
1282 outsz
= SIZEOF_WCHAR_T
* 2;
1286 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1287 (char**)&wbufPtr
, &outsz
);
1289 if (ICONV_FAILED(res
, insz
))
1291 ms_wcCharsetName
= NULL
;
1292 wxLogLastError(wxT("iconv"));
1293 wxLogError(_("Conversion to charset '%s' doesn't work."), name
);
1297 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1302 ms_wcCharsetName
= NULL
;
1304 // VS: we must not output an error here, since wxWidgets will safely
1305 // fall back to using wxEncodingConverter.
1306 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name
);
1310 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName
, ms_wcNeedsSwap
);
1312 else // we already have ms_wcCharsetName
1314 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1317 // NB: don't ever pass NULL to iconv_open(), it may crash!
1318 if ( ms_wcCharsetName
)
1320 w2m
= iconv_open( cname
, ms_wcCharsetName
);
1328 wxMBConv_iconv::~wxMBConv_iconv()
1330 if ( m2w
!= (iconv_t
)-1 )
1332 if ( w2m
!= (iconv_t
)-1 )
1336 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1339 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1340 // Unfortunately there are a couple of global wxCSConv objects such as
1341 // wxConvLocal that are used all over wx code, so we have to make sure
1342 // the handle is used by at most one thread at a time. Otherwise
1343 // only a few wx classes would be safe to use from non-main threads
1344 // as MB<->WC conversion would fail "randomly".
1345 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1348 size_t inbuf
= strlen(psz
);
1349 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1351 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1352 wchar_t *bufPtr
= buf
;
1353 const char *pszPtr
= psz
;
1357 // have destination buffer, convert there
1359 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1360 (char**)&bufPtr
, &outbuf
);
1361 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1365 // convert to native endianness
1366 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
1369 // NB: iconv was given only strlen(psz) characters on input, and so
1370 // it couldn't convert the trailing zero. Let's do it ourselves
1371 // if there's some room left for it in the output buffer.
1377 // no destination buffer... convert using temp buffer
1378 // to calculate destination buffer requirement
1383 outbuf
= 8*SIZEOF_WCHAR_T
;
1386 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1387 (char**)&bufPtr
, &outbuf
);
1389 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1390 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1393 if (ICONV_FAILED(cres
, inbuf
))
1395 //VS: it is ok if iconv fails, hence trace only
1396 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1403 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1406 // NB: explained in MB2WC
1407 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1410 size_t inbuf
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1414 wchar_t *tmpbuf
= 0;
1418 // need to copy to temp buffer to switch endianness
1419 // this absolutely doesn't rock!
1420 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1421 // could be in read-only memory, or be accessed in some other thread)
1422 tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
1423 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
1424 WC_BSWAP(tmpbuf
, inbuf
)
1430 // have destination buffer, convert there
1431 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1435 // NB: iconv was given only wcslen(psz) characters on input, and so
1436 // it couldn't convert the trailing zero. Let's do it ourselves
1437 // if there's some room left for it in the output buffer.
1443 // no destination buffer... convert using temp buffer
1444 // to calculate destination buffer requirement
1448 buf
= tbuf
; outbuf
= 16;
1450 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1453 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1461 if (ICONV_FAILED(cres
, inbuf
))
1463 //VS: it is ok if iconv fails, hence trace only
1464 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1471 #endif // HAVE_ICONV
1474 // ============================================================================
1475 // Win32 conversion classes
1476 // ============================================================================
1478 #ifdef wxHAVE_WIN32_MB2WC
1482 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1483 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1486 class wxMBConv_win32
: public wxMBConv
1491 m_CodePage
= CP_ACP
;
1495 wxMBConv_win32(const wxChar
* name
)
1497 m_CodePage
= wxCharsetToCodepage(name
);
1500 wxMBConv_win32(wxFontEncoding encoding
)
1502 m_CodePage
= wxEncodingToCodepage(encoding
);
1506 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1508 // note that we have to use the MB_ERR_INVALID_CHARS flag as without it
1509 // the behaviour is not compatible with the Unix version (using iconv)
1510 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
1511 // wouldn't work if reading an incomplete MB char didn't result in an
1514 // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1515 // an error (tested under Windows Server 2003) and apparently it is
1516 // done on purpose, i.e. the function accepts any input in this case
1517 // and although I'd prefer to return error on ill-formed output, our
1518 // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1519 // explicitly ill-formed according to RFC 2152) either so we don't
1520 // even have any fallback here...
1521 int flags
= m_CodePage
== CP_UTF7
? 0 : MB_ERR_INVALID_CHARS
;
1523 const size_t len
= ::MultiByteToWideChar
1525 m_CodePage
, // code page
1526 flags
, // flags: fall on error
1527 psz
, // input string
1528 -1, // its length (NUL-terminated)
1529 buf
, // output string
1530 buf
? n
: 0 // size of output buffer
1533 // note that it returns count of written chars for buf != NULL and size
1534 // of the needed buffer for buf == NULL so in either case the length of
1535 // the string (which never includes the terminating NUL) is one less
1536 return len
? len
- 1 : (size_t)-1;
// Converts the NUL-terminated wide string pwz to the multibyte encoding
// selected by m_CodePage by calling ::WideCharToMultiByte().
//
// buf is the output buffer and n its size; when buf is NULL a zero output
// size is passed to the API instead.  Lossy conversions are rejected:
// either WC_NO_BEST_FIT_CHARS together with the "default char used" flag is
// relied upon, or the output is converted back with MB2WC() and compared
// with the input string.
1539 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1542 we have a problem here: by default, WideCharToMultiByte() may
1543 replace characters unrepresentable in the target code page with bad
1544 quality approximations such as turning "1/2" symbol (U+00BD) into
1545 "1" for the code pages which don't have it and we, obviously, want
1546 to avoid this at any price
1548 the trouble is that this function does it _silently_, i.e. it won't
1549 even tell us whether it did or not... Win98/2000 and higher provide
1550 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1551 we have to resort to a round trip, i.e. check that converting back
1552 results in the same string -- this is, of course, expensive but
1553 otherwise we simply can't be sure to not garble the data.
1556 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1557 // it doesn't work with CJK encodings (which we test for rather roughly
1558 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1560 BOOL usedDef
wxDUMMY_INITIALIZE(false);
// NOTE(review): the 50000 bound is the "rough" code page test mentioned
// above; confirm against MSDN which code page numbers it is meant to exclude
1563 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1565 // it's our lucky day
1566 flags
= WC_NO_BEST_FIT_CHARS
;
// ask the API to report whether the default replacement char was used
1567 pUsedDef
= &usedDef
;
1569 else // old system or unsupported encoding
1575 const size_t len
= ::WideCharToMultiByte
1577 m_CodePage
, // code page
1578 flags
, // either none or no best fit
1579 pwz
, // input string
1580 -1, // it is (wide) NUL-terminated
1581 buf
, // output buffer
1582 buf
? n
: 0, // and its size
1583 NULL
, // default "replacement" char
1584 pUsedDef
// [out] was it used?
1589 // function totally failed
1593 // if we were really converting, check if we succeeded
1598 // check if the conversion failed, i.e. if any replacements
1603 else // we must resort to double tripping...
// convert the result back into a scratch wide buffer and compare it with
// the original input to detect any silent approximation
1605 wxWCharBuffer
wcBuf(n
);
1606 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1607 wcscmp(wcBuf
, pwz
) != 0 )
1609 // we didn't obtain the same thing we started from, hence
1610 // the conversion was lossy and we consider that it failed
1616 // see the comment above for the reason of "len - 1"
// Reports whether this converter is usable: true unless m_CodePage is -1
// (presumably the value stored when no code page could be determined --
// confirm against the constructors).
1620 bool IsOk() const { return m_CodePage
!= -1; }
// Tells WC2MB() whether it may rely on WC_NO_BEST_FIT_CHARS on the running
// system.  The answer is derived from wxGetOsVersion() and cached in the
// function-local static s_isWin98Or2k, where -1 means "not determined yet".
1623 static bool CanUseNoBestFit()
1625 static int s_isWin98Or2k
= -1;
1627 if ( s_isWin98Or2k
== -1 )
1630 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
// NOTE(review): presumably the Win9x family branch, where 4.10 is
// Windows 98 -- confirm against the case label
1633 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
// NOTE(review): presumably the NT family branch, where major version 5 is
// Windows 2000 -- confirm against the case label
1637 s_isWin98Or2k
= verMaj
>= 5;
1641 // unknown, be conservative by default
1645 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1648 return s_isWin98Or2k
== 1;
1654 #endif // wxHAVE_WIN32_MB2WC
1656 // ============================================================================
1657 // Cocoa conversion classes
1658 // ============================================================================
1660 #if defined(__WXCOCOA__)
1662 // RN: There is no UTF-32 support in either Core Foundation or
1663 // Cocoa. Strangely enough, Core Foundation uses UTF-32
1664 // internally quite a bit - it's just not public (yet).
1666 #include <CoreFoundation/CFString.h>
1667 #include <CoreFoundation/CFStringEncodingExt.h>
// Maps a wxFontEncoding value to the matching Core Foundation
// CFStringEncoding constant.
//
// wxFONTENCODING_DEFAULT yields CFStringGetSystemEncoding(); every other
// supported encoding is translated by the switch below.  The result starts
// out as kCFStringEncodingInvalidId, so encodings without a case of their
// own presumably end up with that value (NOTE(review): confirm against the
// default branch of the switch).
1669 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
1671 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
1672 if ( encoding
== wxFONTENCODING_DEFAULT
)
1674 enc
= CFStringGetSystemEncoding();
1676 else switch( encoding
)
// --- ISO 8859 family ---
1678 case wxFONTENCODING_ISO8859_1
:
1679 enc
= kCFStringEncodingISOLatin1
;
1681 case wxFONTENCODING_ISO8859_2
:
1682 enc
= kCFStringEncodingISOLatin2
;
1684 case wxFONTENCODING_ISO8859_3
:
1685 enc
= kCFStringEncodingISOLatin3
;
1687 case wxFONTENCODING_ISO8859_4
:
1688 enc
= kCFStringEncodingISOLatin4
;
1690 case wxFONTENCODING_ISO8859_5
:
1691 enc
= kCFStringEncodingISOLatinCyrillic
;
1693 case wxFONTENCODING_ISO8859_6
:
1694 enc
= kCFStringEncodingISOLatinArabic
;
1696 case wxFONTENCODING_ISO8859_7
:
1697 enc
= kCFStringEncodingISOLatinGreek
;
1699 case wxFONTENCODING_ISO8859_8
:
1700 enc
= kCFStringEncodingISOLatinHebrew
;
1702 case wxFONTENCODING_ISO8859_9
:
1703 enc
= kCFStringEncodingISOLatin5
;
1705 case wxFONTENCODING_ISO8859_10
:
1706 enc
= kCFStringEncodingISOLatin6
;
1708 case wxFONTENCODING_ISO8859_11
:
1709 enc
= kCFStringEncodingISOLatinThai
;
1711 case wxFONTENCODING_ISO8859_13
:
1712 enc
= kCFStringEncodingISOLatin7
;
1714 case wxFONTENCODING_ISO8859_14
:
1715 enc
= kCFStringEncodingISOLatin8
;
1717 case wxFONTENCODING_ISO8859_15
:
1718 enc
= kCFStringEncodingISOLatin9
;
// --- Cyrillic KOI8 / "alternative" encodings ---
1721 case wxFONTENCODING_KOI8
:
1722 enc
= kCFStringEncodingKOI8_R
;
1724 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
1725 enc
= kCFStringEncodingDOSRussian
;
1728 // case wxFONTENCODING_BULGARIAN :
// --- DOS / OEM code pages ---
1732 case wxFONTENCODING_CP437
:
1733 enc
=kCFStringEncodingDOSLatinUS
;
1735 case wxFONTENCODING_CP850
:
1736 enc
= kCFStringEncodingDOSLatin1
;
1738 case wxFONTENCODING_CP852
:
1739 enc
= kCFStringEncodingDOSLatin2
;
1741 case wxFONTENCODING_CP855
:
1742 enc
= kCFStringEncodingDOSCyrillic
;
// same CFString encoding as wxFONTENCODING_ALTERNATIVE above
1744 case wxFONTENCODING_CP866
:
1745 enc
=kCFStringEncodingDOSRussian
;
1747 case wxFONTENCODING_CP874
:
1748 enc
= kCFStringEncodingDOSThai
;
1750 case wxFONTENCODING_CP932
:
1751 enc
= kCFStringEncodingDOSJapanese
;
1753 case wxFONTENCODING_CP936
:
1754 enc
=kCFStringEncodingDOSChineseSimplif
;
1756 case wxFONTENCODING_CP949
:
1757 enc
= kCFStringEncodingDOSKorean
;
1759 case wxFONTENCODING_CP950
:
1760 enc
= kCFStringEncodingDOSChineseTrad
;
// --- Windows code pages ---
1762 case wxFONTENCODING_CP1250
:
1763 enc
= kCFStringEncodingWindowsLatin2
;
1765 case wxFONTENCODING_CP1251
:
1766 enc
=kCFStringEncodingWindowsCyrillic
;
1768 case wxFONTENCODING_CP1252
:
1769 enc
=kCFStringEncodingWindowsLatin1
;
1771 case wxFONTENCODING_CP1253
:
1772 enc
= kCFStringEncodingWindowsGreek
;
1774 case wxFONTENCODING_CP1254
:
1775 enc
= kCFStringEncodingWindowsLatin5
;
1777 case wxFONTENCODING_CP1255
:
1778 enc
=kCFStringEncodingWindowsHebrew
;
1780 case wxFONTENCODING_CP1256
:
1781 enc
=kCFStringEncodingWindowsArabic
;
1783 case wxFONTENCODING_CP1257
:
1784 enc
= kCFStringEncodingWindowsBalticRim
;
// --- Unicode transformation formats and EUC ---
1786 // This only really encodes to UTF7 (if that) evidently
1787 // case wxFONTENCODING_UTF7 :
1788 // enc = kCFStringEncodingNonLossyASCII ;
1790 case wxFONTENCODING_UTF8
:
1791 enc
= kCFStringEncodingUTF8
;
1793 case wxFONTENCODING_EUC_JP
:
1794 enc
= kCFStringEncodingEUC_JP
;
1796 case wxFONTENCODING_UTF16
:
1797 enc
= kCFStringEncodingUnicode
;
// --- classic Mac OS script encodings ---
1799 case wxFONTENCODING_MACROMAN
:
1800 enc
= kCFStringEncodingMacRoman
;
1802 case wxFONTENCODING_MACJAPANESE
:
1803 enc
= kCFStringEncodingMacJapanese
;
1805 case wxFONTENCODING_MACCHINESETRAD
:
1806 enc
= kCFStringEncodingMacChineseTrad
;
1808 case wxFONTENCODING_MACKOREAN
:
1809 enc
= kCFStringEncodingMacKorean
;
1811 case wxFONTENCODING_MACARABIC
:
1812 enc
= kCFStringEncodingMacArabic
;
1814 case wxFONTENCODING_MACHEBREW
:
1815 enc
= kCFStringEncodingMacHebrew
;
1817 case wxFONTENCODING_MACGREEK
:
1818 enc
= kCFStringEncodingMacGreek
;
1820 case wxFONTENCODING_MACCYRILLIC
:
1821 enc
= kCFStringEncodingMacCyrillic
;
1823 case wxFONTENCODING_MACDEVANAGARI
:
1824 enc
= kCFStringEncodingMacDevanagari
;
1826 case wxFONTENCODING_MACGURMUKHI
:
1827 enc
= kCFStringEncodingMacGurmukhi
;
1829 case wxFONTENCODING_MACGUJARATI
:
1830 enc
= kCFStringEncodingMacGujarati
;
1832 case wxFONTENCODING_MACORIYA
:
1833 enc
= kCFStringEncodingMacOriya
;
1835 case wxFONTENCODING_MACBENGALI
:
1836 enc
= kCFStringEncodingMacBengali
;
1838 case wxFONTENCODING_MACTAMIL
:
1839 enc
= kCFStringEncodingMacTamil
;
1841 case wxFONTENCODING_MACTELUGU
:
1842 enc
= kCFStringEncodingMacTelugu
;
1844 case wxFONTENCODING_MACKANNADA
:
1845 enc
= kCFStringEncodingMacKannada
;
// (sic) the wx enumerator is spelled MALAJALAM; it maps to Malayalam
1847 case wxFONTENCODING_MACMALAJALAM
:
1848 enc
= kCFStringEncodingMacMalayalam
;
1850 case wxFONTENCODING_MACSINHALESE
:
1851 enc
= kCFStringEncodingMacSinhalese
;
1853 case wxFONTENCODING_MACBURMESE
:
1854 enc
= kCFStringEncodingMacBurmese
;
1856 case wxFONTENCODING_MACKHMER
:
1857 enc
= kCFStringEncodingMacKhmer
;
1859 case wxFONTENCODING_MACTHAI
:
1860 enc
= kCFStringEncodingMacThai
;
1862 case wxFONTENCODING_MACLAOTIAN
:
1863 enc
= kCFStringEncodingMacLaotian
;
1865 case wxFONTENCODING_MACGEORGIAN
:
1866 enc
= kCFStringEncodingMacGeorgian
;
1868 case wxFONTENCODING_MACARMENIAN
:
1869 enc
= kCFStringEncodingMacArmenian
;
1871 case wxFONTENCODING_MACCHINESESIMP
:
1872 enc
= kCFStringEncodingMacChineseSimp
;
1874 case wxFONTENCODING_MACTIBETAN
:
1875 enc
= kCFStringEncodingMacTibetan
;
1877 case wxFONTENCODING_MACMONGOLIAN
:
1878 enc
= kCFStringEncodingMacMongolian
;
1880 case wxFONTENCODING_MACETHIOPIC
:
1881 enc
= kCFStringEncodingMacEthiopic
;
1883 case wxFONTENCODING_MACCENTRALEUR
:
1884 enc
= kCFStringEncodingMacCentralEurRoman
;
// (sic) the wx enumerator is spelled VIATNAMESE; it maps to Vietnamese
1886 case wxFONTENCODING_MACVIATNAMESE
:
1887 enc
= kCFStringEncodingMacVietnamese
;
1889 case wxFONTENCODING_MACARABICEXT
:
1890 enc
= kCFStringEncodingMacExtArabic
;
1892 case wxFONTENCODING_MACSYMBOL
:
1893 enc
= kCFStringEncodingMacSymbol
;
1895 case wxFONTENCODING_MACDINGBATS
:
1896 enc
= kCFStringEncodingMacDingbats
;
1898 case wxFONTENCODING_MACTURKISH
:
1899 enc
= kCFStringEncodingMacTurkish
;
1901 case wxFONTENCODING_MACCROATIAN
:
1902 enc
= kCFStringEncodingMacCroatian
;
1904 case wxFONTENCODING_MACICELANDIC
:
1905 enc
= kCFStringEncodingMacIcelandic
;
1907 case wxFONTENCODING_MACROMANIAN
:
1908 enc
= kCFStringEncodingMacRomanian
;
1910 case wxFONTENCODING_MACCELTIC
:
1911 enc
= kCFStringEncodingMacCeltic
;
1913 case wxFONTENCODING_MACGAELIC
:
1914 enc
= kCFStringEncodingMacGaelic
;
1916 // case wxFONTENCODING_MACKEYBOARD :
1917 // enc = kCFStringEncodingMacKeyboardGlyphs ;
1920 // because gcc is picky
// wxMBConv implementation for the Cocoa port: conversions go through a
// temporary CFString, using the CFStringEncoding kept in m_encoding.
1926 class wxMBConv_cocoa
: public wxMBConv
// default construction uses the system encoding reported by Core Foundation
1931 Init(CFStringGetSystemEncoding()) ;
// construct from a charset name: the name is mapped to a wxFontEncoding
// (non-interactively, hence "false") and then to a CFStringEncoding
1935 wxMBConv_cocoa(const wxChar
* name
)
1937 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
// construct directly from a wxFontEncoding value
1941 wxMBConv_cocoa(wxFontEncoding encoding
)
1943 Init( wxCFStringEncFromFontEnc(encoding
) );
// common part of all constructors: just remembers the encoding
1950 void Init( CFStringEncoding encoding
)
1952 m_encoding
= encoding
;
// Multibyte -> wide conversion: szUnConv is wrapped in a CFString whose
// UTF-16 characters are then extracted into szOut.
1955 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
1959 CFStringRef theString
= CFStringCreateWithBytes (
1960 NULL
, //the allocator
1961 (const UInt8
*)szUnConv
,
1964 false //no BOM/external representation
1967 wxASSERT(theString
);
1969 size_t nOutLength
= CFStringGetLength(theString
);
// NOTE(review): presumably the early-exit path taken when only the length
// is requested (szOut == NULL) -- confirm
1973 CFRelease(theString
);
// NOTE(review): the range length is nOutSize rather than nOutLength; check
// what happens when the output buffer is larger than the string
1977 CFRange theRange
= { 0, nOutSize
};
// with a 4-byte wchar_t the UTF-16 units cannot be written to szOut
// directly, so they go through a temporary UniChar array first
1979 #if SIZEOF_WCHAR_T == 4
1980 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
1983 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
1985 CFRelease(theString
);
// NOTE(review): index nOutLength is written into a buffer of nOutSize
// elements; this relies on nOutLength < nOutSize -- confirm
1987 szUniCharBuffer
[nOutLength
] = '\0' ;
// NOTE(review): plain wxMBConvUTF16 is used here whereas WC2MB() below uses
// wxMBConvUTF16BE for the reverse step -- confirm the asymmetry is intended
1989 #if SIZEOF_WCHAR_T == 4
1990 wxMBConvUTF16 converter
;
1991 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
1992 delete[] szUniCharBuffer
;
// Wide -> multibyte conversion: szUnConv is turned into a CFString which is
// then encoded into szOut using m_encoding.
1998 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2002 size_t nRealOutSize
;
2003 size_t nBufSize
= wxWcslen(szUnConv
);
// by default the input is reinterpreted as UniChar in place ...
2004 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
// ... but a 4-byte wchar_t must first be re-encoded as UTF-16 into a
// temporary array (one extra UniChar is reserved for the terminator)
2006 #if SIZEOF_WCHAR_T == 4
2007 wxMBConvUTF16BE converter
;
2008 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2009 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
2010 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
// from here on nBufSize counts UniChars, not bytes
2011 nBufSize
/= sizeof(UniChar
);
// the CFString only references the buffer: kCFAllocatorNull keeps Core
// Foundation from freeing it
2014 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2018 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2021 wxASSERT(theString
);
2023 //Note that CER puts a BOM when converting to unicode
2024 //so we check and use getchars instead in that case
2025 if (m_encoding
== kCFStringEncodingUnicode
)
2028 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2030 nRealOutSize
= CFStringGetLength(theString
) + 1;
2036 CFRangeMake(0, CFStringGetLength(theString
)),
2038 0, //what to put in characters that can't be converted -
2039 //0 tells CFString to return NULL if it meets such a character
2040 false, //not an external representation
// NOTE(review): nRealOutSize is a size_t accessed through a CFIndex
// pointer; this assumes both types have the same size -- confirm
2043 (CFIndex
*) &nRealOutSize
2047 CFRelease(theString
);
// release the temporary UTF-16 copy made above
2049 #if SIZEOF_WCHAR_T == 4
2050 delete[] szUniBuffer
;
// the Unicode branch above stores length + 1, hence the "- 1" here
2053 return nRealOutSize
- 1;
// usable only if the encoding is valid and available in Core Foundation
2058 return m_encoding
!= kCFStringEncodingInvalidId
&&
2059 CFStringIsEncodingAvailable(m_encoding
);
// the Core Foundation encoding this converter works with
2063 CFStringEncoding m_encoding
;
2066 #endif // defined(__WXCOCOA__)
2068 // ============================================================================
2069 // Mac conversion classes
2070 // ============================================================================
2072 #if defined(__WXMAC__) && defined(TARGET_CARBON)
// wxMBConv implementation for Carbon builds based on the Text Encoding
// Converter (TEC): one converter object per direction is created in Init()
// and disposed of in the destructor.
2074 class wxMBConv_mac
: public wxMBConv
// default construction uses the system encoding
2079 Init(CFStringGetSystemEncoding()) ;
// construct from a charset name, mapped non-interactively ("false") to a
// wxFontEncoding and from there to a Mac text encoding
2083 wxMBConv_mac(const wxChar
* name
)
2085 Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
// construct directly from a wxFontEncoding value
2089 wxMBConv_mac(wxFontEncoding encoding
)
2091 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
// destructor part: release both TEC converters
// NOTE(review): the status results do not appear to be checked -- confirm
2096 OSStatus status
= noErr
;
2097 status
= TECDisposeConverter(m_MB2WC_converter
);
2098 status
= TECDisposeConverter(m_WC2MB_converter
);
// common part of all constructors: remembers the 8-bit encoding, builds the
// 16-bit Unicode encoding descriptor and creates one converter per direction
2102 void Init( TextEncodingBase encoding
)
2104 OSStatus status
= noErr
;
2105 m_char_encoding
= encoding
;
2106 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2108 status
= TECCreateConverter(&m_MB2WC_converter
,
2110 m_unicode_encoding
);
2111 status
= TECCreateConverter(&m_WC2MB_converter
,
// Multibyte -> wide conversion via m_MB2WC_converter.
2116 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2118 OSStatus status
= noErr
;
2119 ByteCount byteOutLen
;
2120 ByteCount byteInLen
= strlen(psz
) ;
2121 wchar_t *tbuf
= NULL
;
2122 UniChar
* ubuf
= NULL
;
2127 //apple specs say at least 32
// NOTE(review): presumably only done when buf is NULL; the input byte count
// is reused as the number of output characters -- confirm
2128 n
= wxMax( 32 , byteInLen
) ;
2129 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2131 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
// with a 4-byte wchar_t TEC writes UTF-16 into a separate buffer; the extra
// 2 bytes are presumably room for a terminating UniChar
2132 #if SIZEOF_WCHAR_T == 4
2133 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
// otherwise TEC writes straight into the destination buffer
2135 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2137 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2138 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2139 #if SIZEOF_WCHAR_T == 4
2140 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2141 // is not properly terminated we get random characters at the end
2142 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
// widen the UTF-16 units to wchar_t
2143 wxMBConvUTF16BE converter
;
2144 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
// 2-byte wchar_t: the result length is simply the number of UniChars written
2147 res
= byteOutLen
/ sizeof( UniChar
) ;
// NOTE(review): presumably NUL-terminates buf when there is room -- confirm
2152 if ( buf
&& res
< n
)
// Wide -> multibyte conversion via m_WC2MB_converter, followed by a round
// trip through MB2WC() to detect lossy conversions.
2158 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2160 OSStatus status
= noErr
;
2161 ByteCount byteOutLen
;
2162 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2168 //apple specs say at least 32
// scratch buffer sized at 8 output bytes per input character plus room for
// one wchar_t (NOTE(review): presumably only used when buf is NULL)
2169 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2170 tbuf
= (char*) malloc( n
) ;
2173 ByteCount byteBufferLen
= n
;
2174 UniChar
* ubuf
= NULL
;
// with a 4-byte wchar_t the input is first re-encoded as UTF-16 into a
// temporary buffer; byteInLen then becomes the UTF-16 byte count
2175 #if SIZEOF_WCHAR_T == 4
2176 wxMBConvUTF16BE converter
;
2177 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2178 byteInLen
= unicharlen
;
2179 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2180 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
// otherwise the input can be handed to TEC as is
2182 ubuf
= (UniChar
*) psz
;
2184 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2185 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2186 #if SIZEOF_WCHAR_T == 4
2192 size_t res
= byteOutLen
;
2193 if ( buf
&& res
< n
)
2197 //we need to double-trip to verify it didn't insert any ? in place
2198 //of bogus characters
2199 wxWCharBuffer
wcBuf(n
);
2200 size_t pszlen
= wxWcslen(psz
);
2201 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2202 wxWcslen(wcBuf
) != pszlen
||
2203 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2205 // we didn't obtain the same thing we started from, hence
2206 // the conversion was lossy and we consider that it failed
// usable only if both converters exist
2215 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
// TEC converter objects, one per conversion direction
2218 TECObjectRef m_MB2WC_converter
;
2219 TECObjectRef m_WC2MB_converter
;
// the multibyte encoding and the 16-bit Unicode encoding set up in Init()
2221 TextEncodingBase m_char_encoding
;
2222 TextEncodingBase m_unicode_encoding
;
2225 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2227 // ============================================================================
2228 // wxEncodingConverter based conversion classes
2229 // ============================================================================
// Fallback wxMBConv implementation built on wxEncodingConverter: one
// converter instance per direction (m2w and w2m) between m_enc and
// wxFONTENCODING_UNICODE.
2233 class wxMBConv_wxwin
: public wxMBConv
// initialisation: the object is usable only if both directions could be set up
2238 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2239 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2243 // temporarily just use wxEncodingConverter stuff,
2244 // so that it works while a better implementation is built
2245 wxMBConv_wxwin(const wxChar
* name
)
// map the charset name to an encoding without showing any dialog ("false")
2248 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
// NOTE(review): presumably the fallback used when no name is given -- confirm
2250 m_enc
= wxFONTENCODING_SYSTEM
;
// construct directly from a wxFontEncoding value
2255 wxMBConv_wxwin(wxFontEncoding enc
)
// Multibyte -> wide conversion.  The output size n is ignored (WXUNUSED),
// so nothing here limits how much is written to buf.
2262 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2264 size_t inbuf
= strlen(psz
);
2267 if (!m2w
.Convert(psz
,buf
))
// Wide -> multibyte conversion; as above, the output size n is ignored.
2273 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2275 const size_t inbuf
= wxWcslen(psz
);
2278 if (!w2m
.Convert(psz
,buf
))
// reports the outcome of the initialisation above
2285 bool IsOk() const { return m_ok
; }
// the 8-bit encoding handled by this object and the two converters
2288 wxFontEncoding m_enc
;
2289 wxEncodingConverter m2w
, w2m
;
2291 // were we initialized successfully?
2294 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2297 #endif // wxUSE_FONTMAP
2299 // ============================================================================
2300 // wxCSConv implementation
2301 // ============================================================================
// Common initialisation helper of wxCSConv.
// NOTE(review): presumably resets m_name and m_convReal and marks the real
// converter as not yet created (m_deferred) -- confirm against the body.
2303 void wxCSConv::Init()
// Constructs a converter identified by a charset name.  The encoding member
// is set to wxFONTENCODING_SYSTEM, so the name is what identifies the
// charset (NOTE(review): presumably the name is stored with SetName() --
// confirm against the body).
2310 wxCSConv::wxCSConv(const wxChar
*charset
)
2319 m_encoding
= wxFONTENCODING_SYSTEM
;
// Constructs a converter for the given font encoding.
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not valid here: they
// trigger a debug failure message and are replaced by wxFONTENCODING_SYSTEM.
2322 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2324 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2326 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2328 encoding
= wxFONTENCODING_SYSTEM
;
2333 m_encoding
= encoding
;
// Destructor.
// NOTE(review): presumably releases the name and the real converter via
// Clear() -- confirm against the body.
2336 wxCSConv::~wxCSConv()
// Copy constructor: takes over the charset name (SetName() makes its own
// copy of the string) and the encoding of conv.
// NOTE(review): the real converter does not appear to be copied; presumably
// it is created again on demand -- confirm.
2341 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2346 SetName(conv
.m_name
);
2347 m_encoding
= conv
.m_encoding
;
// Assignment: takes over the charset name (copied by SetName()) and the
// encoding of conv.
// NOTE(review): presumably the previous state is cleared first and *this is
// returned -- confirm against the body.
2350 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2354 SetName(conv
.m_name
);
2355 m_encoding
= conv
.m_encoding
;
// Resets the converter.
// NOTE(review): presumably frees m_name (allocated with wxStrdup() in
// SetName()) and destroys m_convReal -- confirm against the body.
2360 void wxCSConv::Clear()
// Stores a private copy (wxStrdup) of the charset name in m_name.
// NOTE(review): presumably skipped when charset is NULL -- confirm.
2369 void wxCSConv::SetName(const wxChar
*charset
)
2373 m_name
= wxStrdup(charset
);
// Creates the converter object that does the real work for the charset
// described by m_name / m_encoding.  Candidates are tried in the order
// given in the comment below: OS facilities first, then the built-in UTF
// converters, then wxEncodingConverter; if nothing works an error is logged.
2378 wxMBConv
*wxCSConv::DoCreate() const
2380 // check for the special case of ASCII or ISO8859-1 charset: as we have
2381 // special knowledge of it anyhow, we don't need to create a special
2382 // conversion object
2383 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
2385 // don't convert at all
2389 // we trust OS to do conversion better than we can so try external
2390 // conversion methods first
2392 // the full order is:
2393 // 1. OS conversion (iconv() under Unix or Win32 API)
2394 // 2. hard coded conversions for UTF
2395 // 3. wxEncodingConverter as fall back
2401 #endif // !wxUSE_FONTMAP
// step 1a: iconv(), which needs a charset name
2403 wxString
name(m_name
);
2407 name
= wxFontMapperBase::Get()->GetEncodingName(m_encoding
);
2408 #endif // wxUSE_FONTMAP
2410 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
2416 #endif // HAVE_ICONV
// step 1b: Win32 API, by name if we have one and by encoding otherwise
2418 #ifdef wxHAVE_WIN32_MB2WC
2421 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2422 : new wxMBConv_win32(m_encoding
);
2431 #endif // wxHAVE_WIN32_MB2WC
// step 1c: Mac Text Encoding Converter
2432 #if defined(__WXMAC__)
2434 // leave UTF16 and UTF32 to the built-ins of wx
2435 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2436 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2440 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
2441 : new wxMBConv_mac(m_encoding
);
2443 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
// step 1d: Core Foundation based converter of the Cocoa port
2452 #if defined(__WXCOCOA__)
2454 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
2458 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
2459 : new wxMBConv_cocoa(m_encoding
);
2461 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
// step 2: built-in UTF converters, selected by encoding
2471 wxFontEncoding enc
= m_encoding
;
2473 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2475 // use "false" to suppress interactive dialogs -- we can be called from
2476 // anywhere and popping up a dialog from here is the last thing we want to
2478 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2480 #endif // wxUSE_FONTMAP
2484 case wxFONTENCODING_UTF7
:
2485 return new wxMBConvUTF7
;
2487 case wxFONTENCODING_UTF8
:
2488 return new wxMBConvUTF8
;
2490 case wxFONTENCODING_UTF16BE
:
2491 return new wxMBConvUTF16BE
;
2493 case wxFONTENCODING_UTF16LE
:
2494 return new wxMBConvUTF16LE
;
2496 case wxFONTENCODING_UTF32BE
:
2497 return new wxMBConvUTF32BE
;
2499 case wxFONTENCODING_UTF32LE
:
2500 return new wxMBConvUTF32LE
;
2503 // nothing to do but put here to suppress gcc warnings
// step 3: wxEncodingConverter based fallback
2510 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2511 : new wxMBConv_wxwin(m_encoding
);
2517 #endif // wxUSE_FONTMAP
2519 // NB: This is a hack to prevent deadlock. What could otherwise happen
2520 // in Unicode build: wxConvLocal creation ends up being here
2521 // because of some failure and logs the error. But wxLog will try to
2522 // attach timestamp, for which it will need wxConvLocal (to convert
2523 // time to char* and then wchar_t*), but that fails, tries to log
2524 // error, but wxLog has a (already locked) critical section that
2525 // guards static buffer.
2526 static bool alreadyLoggingError
= false;
2527 if (!alreadyLoggingError
)
2529 alreadyLoggingError
= true;
2530 wxLogError(_("Cannot convert from the charset '%s'!"),
2534 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
2535 #else // !wxUSE_FONTMAP
// NOTE(review): m_encoding is a wxFontEncoding value but is formatted with
// "%s" here; this looks like it should be "%d" -- confirm
2536 wxString::Format(_("encoding %s"), m_encoding
).c_str()
2537 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2539 alreadyLoggingError
= false;
// Creates the real converter on demand.  The method is const because it is
// called from the const conversion functions, so the members are modified
// through a non-const alias of this; m_deferred is cleared once DoCreate()
// has run.
2545 void wxCSConv::CreateConvIfNeeded() const
2549 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
2552 // if we have neither the name nor the encoding, use the default
2553 // encoding for this system
2554 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
2556 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
2558 #endif // wxUSE_INTL
2560 self
->m_convReal
= DoCreate();
2561 self
->m_deferred
= false;
// Multibyte -> wide conversion.  The real converter is created on first use
// and the call is forwarded to it; the code after that is the fallback
// (NOTE(review): presumably used when m_convReal is NULL -- confirm), which
// copies each input byte, taken as an unsigned char, to one wide character.
2565 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2567 CreateConvIfNeeded();
2570 return m_convReal
->MB2WC(buf
, psz
, n
);
2573 size_t len
= strlen(psz
);
// "<=" so that the terminating NUL is copied too
2577 for (size_t c
= 0; c
<= len
; c
++)
2578 buf
[c
] = (unsigned char)(psz
[c
]);
// Wide -> multibyte conversion.  The real converter is created on first use
// and the call is forwarded to it; the code after that is the fallback
// (NOTE(review): presumably used when m_convReal is NULL -- confirm), which
// narrows each wide character to a char with a plain cast.
2584 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2586 CreateConvIfNeeded();
2589 return m_convReal
->WC2MB(buf
, psz
, n
);
2592 const size_t len
= wxWcslen(psz
);
// "<=" so that the terminating NUL is copied too
2595 for (size_t c
= 0; c
<= len
; c
++)
2599 buf
[c
] = (char)psz
[c
];
// NOTE(review): second pass over the same range, presumably the variant
// used when buf is NULL -- confirm against the loop body
2604 for (size_t c
= 0; c
<= len
; c
++)
2614 // ----------------------------------------------------------------------------
2616 // ----------------------------------------------------------------------------
// File-local converter objects behind the public globals.  wxConvLibcObj is
// one of wxMBConv_win32, wxMBConv_mac or wxMBConvLibc, selected by the
// preprocessor conditions around these declarations.
2619 static wxMBConv_win32 wxConvLibcObj
;
2620 #elif defined(__WXMAC__) && !defined(__MACH__)
2621 static wxMBConv_mac wxConvLibcObj
;
2623 static wxMBConvLibc wxConvLibcObj
;
2626 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
2627 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
2628 static wxMBConvUTF7 wxConvUTF7Obj
;
2629 static wxMBConvUTF8 wxConvUTF8Obj
;
// exported references bound to the objects above
2631 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
2632 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
2633 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
2634 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
2635 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
// the current conversion initially points at the libc converter
2636 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
2637 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
2645 #else // !wxUSE_WCHAR_T
2647 // stand-ins in absence of wchar_t
2648 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
2653 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T