1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // ============================================================================
17 // ============================================================================
19 // ----------------------------------------------------------------------------
21 // ----------------------------------------------------------------------------
23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
24 #pragma implementation "strconv.h"
27 // For compilers that support precompilation, includes "wx.h".
28 #include "wx/wxprec.h"
39 #include "wx/strconv.h"
44 #include "wx/msw/private.h"
48 #include "wx/msw/missing.h"
59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
60 #define wxHAVE_WIN32_MB2WC
61 #endif // __WIN32__ but !__WXMICROWIN__
63 // ----------------------------------------------------------------------------
65 // ----------------------------------------------------------------------------
75 #include "wx/encconv.h"
76 #include "wx/fontmap.h"
80 #include <ATSUnicode.h>
81 #include <TextCommon.h>
82 #include <TextEncodingConverter.h>
84 #include "wx/mac/private.h" // includes mac headers
86 // ----------------------------------------------------------------------------
88 // ----------------------------------------------------------------------------
90 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
91 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
93 #if SIZEOF_WCHAR_T == 4
94 #define WC_NAME "UCS4"
95 #define WC_BSWAP BSWAP_UCS4
96 #ifdef WORDS_BIGENDIAN
97 #define WC_NAME_BEST "UCS-4BE"
99 #define WC_NAME_BEST "UCS-4LE"
101 #elif SIZEOF_WCHAR_T == 2
102 #define WC_NAME "UTF16"
103 #define WC_BSWAP BSWAP_UTF16
105 #ifdef WORDS_BIGENDIAN
106 #define WC_NAME_BEST "UTF-16BE"
108 #define WC_NAME_BEST "UTF-16LE"
110 #else // sizeof(wchar_t) != 2 nor 4
111 // does this ever happen?
112 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
115 // ============================================================================
117 // ============================================================================
119 // ----------------------------------------------------------------------------
120 // UTF-16 en/decoding to/from UCS-4
121 // ----------------------------------------------------------------------------
124 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
129 *output
= (wxUint16
) input
;
132 else if (input
>=0x110000)
140 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
141 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
147 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
149 if ((*input
<0xd800) || (*input
>0xdfff))
154 else if ((input
[1]<0xdc00) || (input
[1]>=0xdfff))
161 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
167 // ----------------------------------------------------------------------------
169 // ----------------------------------------------------------------------------
171 wxMBConv::~wxMBConv()
173 // nothing to do here (necessary for Darwin linking probably)
176 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
180 // calculate the length of the buffer needed first
181 size_t nLen
= MB2WC(NULL
, psz
, 0);
182 if ( nLen
!= (size_t)-1 )
184 // now do the actual conversion
185 wxWCharBuffer
buf(nLen
);
186 nLen
= MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NULL
187 if ( nLen
!= (size_t)-1 )
194 wxWCharBuffer
buf((wchar_t *)NULL
);
199 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
203 size_t nLen
= WC2MB(NULL
, pwz
, 0);
204 if ( nLen
!= (size_t)-1 )
206 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
207 nLen
= WC2MB(buf
.data(), pwz
, nLen
+ 4);
208 if ( nLen
!= (size_t)-1 )
215 wxCharBuffer
buf((char *)NULL
);
220 const wxWCharBuffer
wxMBConv::cMB2WC(const char *szString
, size_t nStringLen
, size_t* pOutSize
) const
222 wxASSERT(pOutSize
!= NULL
);
224 const char* szEnd
= szString
+ nStringLen
+ 1;
225 const char* szPos
= szString
;
226 const char* szStart
= szPos
;
228 size_t nActualLength
= 0;
229 size_t nCurrentSize
= nStringLen
; //try normal size first (should never resize?)
231 wxWCharBuffer
theBuffer(nCurrentSize
);
233 //Convert the string until the length() is reached, continuing the
234 //loop every time a null character is reached
235 while(szPos
!= szEnd
)
237 wxASSERT(szPos
< szEnd
); //something is _really_ screwed up if this rings true
239 //Get the length of the current (sub)string
240 size_t nLen
= MB2WC(NULL
, szPos
, 0);
242 //Invalid conversion?
243 if( nLen
== (size_t)-1 )
246 theBuffer
.data()[0u] = wxT('\0');
251 //Increase the actual length (+1 for current null character)
252 nActualLength
+= nLen
+ 1;
254 //if the buffer is too small, realloc it
255 if (nActualLength
> (nCurrentSize
+1))
257 wxWCharBuffer
theNewBuffer(nCurrentSize
<< 1);
258 memcpy(theNewBuffer
.data(), theBuffer
.data(), nCurrentSize
* sizeof(wchar_t));
259 theBuffer
= theNewBuffer
;
263 //Convert the current (sub)string
264 if ( MB2WC(&theBuffer
.data()[szPos
- szStart
], szPos
, nLen
+ 1) == (size_t)-1 )
267 theBuffer
.data()[0u] = wxT('\0');
271 //Increment to next (sub)string
272 //Note that we have to use strlen here instead of nLen
273 //here because XX2XX gives us the size of the output buffer,
274 //not necessarily the length of the string
275 szPos
+= strlen(szPos
) + 1;
278 //success - return actual length and the buffer
279 *pOutSize
= nActualLength
;
283 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *szString
, size_t nStringLen
, size_t* pOutSize
) const
285 wxASSERT(pOutSize
!= NULL
);
287 const wchar_t* szEnd
= szString
+ nStringLen
+ 1;
288 const wchar_t* szPos
= szString
;
289 const wchar_t* szStart
= szPos
;
291 size_t nActualLength
= 0;
292 size_t nCurrentSize
= nStringLen
<< 2; //try * 4 first
294 wxCharBuffer
theBuffer(nCurrentSize
);
296 //Convert the string until the length() is reached, continuing the
297 //loop every time a null character is reached
298 while(szPos
!= szEnd
)
300 wxASSERT(szPos
< szEnd
); //something is _really_ screwed up if this rings true
302 //Get the length of the current (sub)string
303 size_t nLen
= WC2MB(NULL
, szPos
, 0);
305 //Invalid conversion?
306 if( nLen
== (size_t)-1 )
309 theBuffer
.data()[0u] = wxT('\0');
313 //Increase the actual length (+1 for current null character)
314 nActualLength
+= nLen
+ 1;
316 //if the buffer is too small, realloc it
317 if (nActualLength
> (nCurrentSize
+1))
319 wxCharBuffer
theNewBuffer(nCurrentSize
<< 1);
320 memcpy(theNewBuffer
.data(), theBuffer
.data(), nCurrentSize
);
321 theBuffer
= theNewBuffer
;
325 //Convert the current (sub)string
326 if(WC2MB(&theBuffer
.data()[szPos
- szStart
], szPos
, nLen
+ 1) == (size_t)-1 )
329 theBuffer
.data()[0u] = wxT('\0');
333 //Increment to next (sub)string
334 //Note that we have to use wxWcslen here instead of nLen
335 //here because XX2XX gives us the size of the output buffer,
336 //not necessarily the length of the string
337 szPos
+= wxWcslen(szPos
) + 1;
340 //success - return actual length and the buffer
341 *pOutSize
= nActualLength
;
345 // ----------------------------------------------------------------------------
347 // ----------------------------------------------------------------------------
349 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
351 return wxMB2WC(buf
, psz
, n
);
354 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
356 return wxWC2MB(buf
, psz
, n
);
358 // ----------------------------------------------------------------------------
360 // ----------------------------------------------------------------------------
362 // Implementation (C) 2004 Fredrik Roubert
365 // BASE64 decoding table
367 static const unsigned char utf7unb64
[] =
369 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
370 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
371 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
372 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
373 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
374 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
375 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
376 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
377 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
378 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
379 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
380 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
381 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
382 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
383 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
384 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
385 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
386 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
387 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
388 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
389 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
390 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
391 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
392 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
393 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
394 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
395 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
396 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
397 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
398 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
399 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
400 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
403 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
408 while (*psz
&& ((!buf
) || (len
< n
)))
410 unsigned char cc
= *psz
++;
418 else if (*psz
== '-')
428 // BASE64 encoded string
432 for (lsb
= false, d
= 0, l
= 0;
433 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff; psz
++)
437 for (l
+= 6; l
>= 8; lsb
= !lsb
)
439 c
= (unsigned char)((d
>> (l
-= 8)) % 256);
448 *buf
= (wchar_t)(c
<< 8);
455 if (buf
&& (len
< n
))
461 // BASE64 encoding table
463 static const unsigned char utf7enb64
[] =
465 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
466 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
467 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
468 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
469 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
470 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
471 'w', 'x', 'y', 'z', '0', '1', '2', '3',
472 '4', '5', '6', '7', '8', '9', '+', '/'
476 // UTF-7 encoding table
478 // 0 - Set D (directly encoded characters)
479 // 1 - Set O (optional direct characters)
480 // 2 - whitespace characters (optional)
481 // 3 - special characters
483 static const unsigned char utf7encode
[128] =
485 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
486 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
487 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
489 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
491 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
495 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t
496 *psz
, size_t n
) const
502 while (*psz
&& ((!buf
) || (len
< n
)))
505 if (cc
< 0x80 && utf7encode
[cc
] < 1)
513 else if (((wxUint32
)cc
) > 0xffff)
515 // no surrogate pair generation (yet?)
526 // BASE64 encode string
527 unsigned int lsb
, d
, l
;
528 for (d
= 0, l
= 0;; psz
++)
530 for (lsb
= 0; lsb
< 2; lsb
++)
533 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
535 for (l
+= 8; l
>= 6; )
539 *buf
++ = utf7enb64
[(d
>> l
) % 64];
544 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
550 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
559 if (buf
&& (len
< n
))
564 // ----------------------------------------------------------------------------
566 // ----------------------------------------------------------------------------
568 static wxUint32 utf8_max
[]=
569 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
571 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
575 while (*psz
&& ((!buf
) || (len
< n
)))
577 unsigned char cc
= *psz
++, fc
= cc
;
579 for (cnt
= 0; fc
& 0x80; cnt
++)
593 // invalid UTF-8 sequence
598 unsigned ocnt
= cnt
- 1;
599 wxUint32 res
= cc
& (0x3f >> cnt
);
603 if ((cc
& 0xC0) != 0x80)
605 // invalid UTF-8 sequence
608 res
= (res
<< 6) | (cc
& 0x3f);
610 if (res
<= utf8_max
[ocnt
])
612 // illegal UTF-8 encoding
616 // cast is ok because wchar_t == wxUint16 if WC_UTF16
617 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
618 if (pa
== (size_t)-1)
627 #endif // WC_UTF16/!WC_UTF16
631 if (buf
&& (len
< n
))
636 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
640 while (*psz
&& ((!buf
) || (len
< n
)))
644 // cast is ok for WC_UTF16
645 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
646 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
648 cc
=(*psz
++) & 0x7fffffff;
651 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
665 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
667 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
672 if (buf
&& (len
<n
)) *buf
= 0;
680 // ----------------------------------------------------------------------------
682 // ----------------------------------------------------------------------------
684 #ifdef WORDS_BIGENDIAN
685 #define wxMBConvUTF16straight wxMBConvUTF16BE
686 #define wxMBConvUTF16swap wxMBConvUTF16LE
688 #define wxMBConvUTF16swap wxMBConvUTF16BE
689 #define wxMBConvUTF16straight wxMBConvUTF16LE
695 // copy 16bit MB to 16bit String
696 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
700 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
703 *buf
++ = *(wxUint16
*)psz
;
706 psz
+= sizeof(wxUint16
);
708 if (buf
&& len
<n
) *buf
=0;
714 // copy 16bit String to 16bit MB
715 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
719 while (*psz
&& (!buf
|| len
< n
))
723 *(wxUint16
*)buf
= *psz
;
724 buf
+= sizeof(wxUint16
);
726 len
+= sizeof(wxUint16
);
729 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
735 // swap 16bit MB to 16bit String
736 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
740 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
744 ((char *)buf
)[0] = psz
[1];
745 ((char *)buf
)[1] = psz
[0];
749 psz
+= sizeof(wxUint16
);
751 if (buf
&& len
<n
) *buf
=0;
757 // swap 16bit String to 16bit MB
758 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
762 while (*psz
&& (!buf
|| len
< n
))
766 *buf
++ = ((char*)psz
)[1];
767 *buf
++ = ((char*)psz
)[0];
769 len
+= sizeof(wxUint16
);
772 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
781 // copy 16bit MB to 32bit String
782 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
786 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
789 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
790 if (pa
== (size_t)-1)
796 psz
+= pa
* sizeof(wxUint16
);
798 if (buf
&& len
<n
) *buf
=0;
804 // copy 32bit String to 16bit MB
805 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
809 while (*psz
&& (!buf
|| len
< n
))
812 size_t pa
=encode_utf16(*psz
, cc
);
814 if (pa
== (size_t)-1)
819 *(wxUint16
*)buf
= cc
[0];
820 buf
+= sizeof(wxUint16
);
823 *(wxUint16
*)buf
= cc
[1];
824 buf
+= sizeof(wxUint16
);
828 len
+= pa
*sizeof(wxUint16
);
831 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
837 // swap 16bit MB to 32bit String
838 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
842 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
846 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
847 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
849 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
850 if (pa
== (size_t)-1)
857 psz
+= pa
* sizeof(wxUint16
);
859 if (buf
&& len
<n
) *buf
=0;
865 // swap 32bit String to 16bit MB
866 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
870 while (*psz
&& (!buf
|| len
< n
))
873 size_t pa
=encode_utf16(*psz
, cc
);
875 if (pa
== (size_t)-1)
880 *buf
++ = ((char*)cc
)[1];
881 *buf
++ = ((char*)cc
)[0];
884 *buf
++ = ((char*)cc
)[3];
885 *buf
++ = ((char*)cc
)[2];
889 len
+= pa
*sizeof(wxUint16
);
892 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
900 // ----------------------------------------------------------------------------
902 // ----------------------------------------------------------------------------
904 #ifdef WORDS_BIGENDIAN
905 #define wxMBConvUTF32straight wxMBConvUTF32BE
906 #define wxMBConvUTF32swap wxMBConvUTF32LE
908 #define wxMBConvUTF32swap wxMBConvUTF32BE
909 #define wxMBConvUTF32straight wxMBConvUTF32LE
913 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
914 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
919 // copy 32bit MB to 16bit String
920 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
924 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
928 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
929 if (pa
== (size_t)-1)
939 psz
+= sizeof(wxUint32
);
941 if (buf
&& len
<n
) *buf
=0;
947 // copy 16bit String to 32bit MB
948 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
952 while (*psz
&& (!buf
|| len
< n
))
956 // cast is ok for WC_UTF16
957 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
958 if (pa
== (size_t)-1)
963 *(wxUint32
*)buf
= cc
;
964 buf
+= sizeof(wxUint32
);
966 len
+= sizeof(wxUint32
);
970 if (buf
&& len
<=n
-sizeof(wxUint32
))
978 // swap 32bit MB to 16bit String
979 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
983 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
986 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
987 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
992 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
993 if (pa
== (size_t)-1)
1003 psz
+= sizeof(wxUint32
);
1013 // swap 16bit String to 32bit MB
1014 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1018 while (*psz
&& (!buf
|| len
< n
))
1022 // cast is ok for WC_UTF16
1023 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
1024 if (pa
== (size_t)-1)
1034 len
+= sizeof(wxUint32
);
1038 if (buf
&& len
<=n
-sizeof(wxUint32
))
1047 // copy 32bit MB to 32bit String
1048 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1052 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1055 *buf
++ = *(wxUint32
*)psz
;
1057 psz
+= sizeof(wxUint32
);
1067 // copy 32bit String to 32bit MB
1068 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1072 while (*psz
&& (!buf
|| len
< n
))
1076 *(wxUint32
*)buf
= *psz
;
1077 buf
+= sizeof(wxUint32
);
1080 len
+= sizeof(wxUint32
);
1084 if (buf
&& len
<=n
-sizeof(wxUint32
))
1091 // swap 32bit MB to 32bit String
1092 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1096 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
1100 ((char *)buf
)[0] = psz
[3];
1101 ((char *)buf
)[1] = psz
[2];
1102 ((char *)buf
)[2] = psz
[1];
1103 ((char *)buf
)[3] = psz
[0];
1107 psz
+= sizeof(wxUint32
);
1117 // swap 32bit String to 32bit MB
1118 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1122 while (*psz
&& (!buf
|| len
< n
))
1126 *buf
++ = ((char *)psz
)[3];
1127 *buf
++ = ((char *)psz
)[2];
1128 *buf
++ = ((char *)psz
)[1];
1129 *buf
++ = ((char *)psz
)[0];
1131 len
+= sizeof(wxUint32
);
1135 if (buf
&& len
<=n
-sizeof(wxUint32
))
1145 // ============================================================================
1146 // The classes doing conversion using the iconv_xxx() functions
1147 // ============================================================================
1151 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
1152 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
1153 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
1154 // (which means error) and says there are 0 bytes left in the input buffer --
1155 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
1156 // this alternative test for iconv() failure.
1157 // [This bug does not appear in glibc 2.2.]
1158 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1159 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1160 (errno != E2BIG || bufLeft != 0))
1162 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1165 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1167 // ----------------------------------------------------------------------------
1168 // wxMBConv_iconv: encapsulates an iconv character set
1169 // ----------------------------------------------------------------------------
1171 class wxMBConv_iconv
: public wxMBConv
1174 wxMBConv_iconv(const wxChar
*name
);
1175 virtual ~wxMBConv_iconv();
1177 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1178 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1181 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
1184 // the iconv handlers used to translate from multibyte to wide char and in
1185 // the other direction
1190 // the name (for iconv_open()) of a wide char charset -- if none is
1191 // available on this machine, it will remain NULL
1192 static const char *ms_wcCharsetName
;
1194 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1195 // different endian-ness than the native one
1196 static bool ms_wcNeedsSwap
;
1199 const char *wxMBConv_iconv::ms_wcCharsetName
= NULL
;
1200 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1202 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1204 // Do it the hard way
1206 for (size_t i
= 0; i
< wxStrlen(name
)+1; i
++)
1207 cname
[i
] = (char) name
[i
];
1209 // check for charset that represents wchar_t:
1210 if (ms_wcCharsetName
== NULL
)
1212 ms_wcNeedsSwap
= false;
1214 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1215 ms_wcCharsetName
= WC_NAME_BEST
;
1216 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1218 if (m2w
== (iconv_t
)-1)
1220 // try charset w/o bytesex info (e.g. "UCS4")
1221 // and check for bytesex ourselves:
1222 ms_wcCharsetName
= WC_NAME
;
1223 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1225 // last bet, try if it knows WCHAR_T pseudo-charset
1226 if (m2w
== (iconv_t
)-1)
1228 ms_wcCharsetName
= "WCHAR_T";
1229 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1232 if (m2w
!= (iconv_t
)-1)
1234 char buf
[2], *bufPtr
;
1235 wchar_t wbuf
[2], *wbufPtr
;
1243 outsz
= SIZEOF_WCHAR_T
* 2;
1247 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1248 (char**)&wbufPtr
, &outsz
);
1250 if (ICONV_FAILED(res
, insz
))
1252 ms_wcCharsetName
= NULL
;
1253 wxLogLastError(wxT("iconv"));
1254 wxLogError(_("Conversion to charset '%s' doesn't work."), name
);
1258 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1263 ms_wcCharsetName
= NULL
;
1265 // VS: we must not output an error here, since wxWidgets will safely
1266 // fall back to using wxEncodingConverter.
1267 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name
);
1271 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName
, ms_wcNeedsSwap
);
1273 else // we already have ms_wcCharsetName
1275 m2w
= iconv_open(ms_wcCharsetName
, cname
);
1278 // NB: don't ever pass NULL to iconv_open(), it may crash!
1279 if ( ms_wcCharsetName
)
1281 w2m
= iconv_open( cname
, ms_wcCharsetName
);
1289 wxMBConv_iconv::~wxMBConv_iconv()
1291 if ( m2w
!= (iconv_t
)-1 )
1293 if ( w2m
!= (iconv_t
)-1 )
1297 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1299 size_t inbuf
= strlen(psz
);
1300 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1302 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1303 wchar_t *bufPtr
= buf
;
1304 const char *pszPtr
= psz
;
1308 // have destination buffer, convert there
1310 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1311 (char**)&bufPtr
, &outbuf
);
1312 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1316 // convert to native endianness
1317 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
1320 // NB: iconv was given only strlen(psz) characters on input, and so
1321 // it couldn't convert the trailing zero. Let's do it ourselves
1322 // if there's some room left for it in the output buffer.
1328 // no destination buffer... convert using temp buffer
1329 // to calculate destination buffer requirement
1334 outbuf
= 8*SIZEOF_WCHAR_T
;
1337 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1338 (char**)&bufPtr
, &outbuf
);
1340 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1341 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1344 if (ICONV_FAILED(cres
, inbuf
))
1346 //VS: it is ok if iconv fails, hence trace only
1347 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1354 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1356 size_t inbuf
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1360 wchar_t *tmpbuf
= 0;
1364 // need to copy to temp buffer to switch endianness
1365 // this absolutely doesn't rock!
1366 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1367 // could be in read-only memory, or be accessed in some other thread)
1368 tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
1369 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
1370 WC_BSWAP(tmpbuf
, inbuf
)
1376 // have destination buffer, convert there
1377 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1381 // NB: iconv was given only wcslen(psz) characters on input, and so
1382 // it couldn't convert the trailing zero. Let's do it ourselves
1383 // if there's some room left for it in the output buffer.
1389 // no destination buffer... convert using temp buffer
1390 // to calculate destination buffer requirement
1394 buf
= tbuf
; outbuf
= 16;
1396 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1399 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1407 if (ICONV_FAILED(cres
, inbuf
))
1409 //VS: it is ok if iconv fails, hence trace only
1410 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1417 #endif // HAVE_ICONV
1420 // ============================================================================
1421 // Win32 conversion classes
1422 // ============================================================================
1424 #ifdef wxHAVE_WIN32_MB2WC
1428 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1429 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1432 class wxMBConv_win32
: public wxMBConv
1437 m_CodePage
= CP_ACP
;
1441 wxMBConv_win32(const wxChar
* name
)
1443 m_CodePage
= wxCharsetToCodepage(name
);
1446 wxMBConv_win32(wxFontEncoding encoding
)
1448 m_CodePage
= wxEncodingToCodepage(encoding
);
1452 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1454 // note that we have to use the MB_ERR_INVALID_CHARS flag as without it
1455 // the behaviour is not compatible with the Unix version (using iconv)
1456 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
1457 // wouldn't work if reading an incomplete MB char didn't result in an
1459 const size_t len
= ::MultiByteToWideChar
1461 m_CodePage
, // code page
1462 MB_ERR_INVALID_CHARS
, // flags: fall on error
1463 psz
, // input string
1464 -1, // its length (NUL-terminated)
1465 buf
, // output string
1466 buf
? n
: 0 // size of output buffer
1469 // note that it returns count of written chars for buf != NULL and size
1470 // of the needed buffer for buf == NULL so in either case the length of
1471 // the string (which never includes the terminating NUL) is one less
1472 return len
? len
- 1 : (size_t)-1;
// Convert the NUL-terminated wide string pwz to the m_CodePage code page with
// ::WideCharToMultiByte(). When buf is non-NULL the output is written to it
// and n is passed as the size of the output buffer; when buf is NULL the API
// is called with a zero sized buffer. Lossy conversions are detected either
// with WC_NO_BEST_FIT_CHARS plus the "default char used" flag or, when that
// flag cannot be relied upon, by converting the result back with MB2WC() and
// comparing it with the input.
// NOTE(review): this excerpt is missing several lines of the function (the
// declarations of flags and pUsedDef, the error checks and the return
// statements), so the exact return values cannot be confirmed from here.
1475 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1478 we have a problem here: by default, WideCharToMultiByte() may
1479 replace characters unrepresentable in the target code page with bad
1480 quality approximations such as turning "1/2" symbol (U+00BD) into
1481 "1" for the code pages which don't have it and we, obviously, want
1482 to avoid this at any price
1484 the trouble is that this function does it _silently_, i.e. it won't
1485 even tell us whether it did or not... Win98/2000 and higher provide
1486 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1487 we have to resort to a round trip, i.e. check that converting back
1488 results in the same string -- this is, of course, expensive but
1489 otherwise we simply can't be sure to not garble the data.
1492 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1493 // it doesn't work with CJK encodings (which we test for rather roughly
1494 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
// supporting the flag (see CanUseNoBestFit())
1496 BOOL usedDef
wxDUMMY_INITIALIZE(false);
// code pages numbered 50000 and above are excluded from the fast path
1499 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1501 // it's our lucky day
1502 flags
= WC_NO_BEST_FIT_CHARS
;
1503 pUsedDef
= &usedDef
;
1505 else // old system or unsupported encoding
1511 const size_t len
= ::WideCharToMultiByte
1513 m_CodePage
, // code page
1514 flags
, // either none or no best fit
1515 pwz
, // input string
1516 -1, // it is (wide) NUL-terminated
1517 buf
, // output buffer
1518 buf
? n
: 0, // and its size
1519 NULL
, // default "replacement" char
1520 pUsedDef
// [out] was it used?
1525 // function totally failed
1529 // if we were really converting, check if we succeeded
1534 // check if the conversion failed, i.e. if any replacements
1539 else // we must resort to double tripping...
// convert the output back to wide characters and compare with the input
1541 wxWCharBuffer
wcBuf(n
);
1542 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1543 wcscmp(wcBuf
, pwz
) != 0 )
1545 // we didn't obtain the same thing we started from, hence
1546 // the conversion was lossy and we consider that it failed
1552 // see the comment above for the reason of "len - 1"
// Report whether this converter is usable: true unless m_CodePage is -1.
1556 bool IsOk() const { return m_CodePage
!= -1; }
// Tell whether the running system is recent enough for the fast path used by
// WC2MB() (which tests this function before selecting WC_NO_BEST_FIT_CHARS).
// The answer is derived from wxGetOsVersion() on the first call and cached in
// the function-local static s_isWin98Or2k, where -1 means "not determined
// yet" and 1 means "yes".
// NOTE(review): the case labels of the switch and the declaration of
// verMaj/verMin are not visible in this excerpt.
1559 static bool CanUseNoBestFit()
1561 static int s_isWin98Or2k
= -1;
1563 if ( s_isWin98Or2k
== -1 )
1566 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
// this branch requires version 4.10 or later
1569 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
// this branch requires major version 5 or later
1573 s_isWin98Or2k
= verMaj
>= 5;
1577 // unknown, be conservative by default
1581 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1584 return s_isWin98Or2k
== 1;
1590 #endif // wxHAVE_WIN32_MB2WC
1592 // ============================================================================
1593 // Cocoa conversion classes
1594 // ============================================================================
1596 #if defined(__WXCOCOA__)
1598 // RN: There is no UTF-32 support in either Core Foundation or
1599 // Cocoa. Strangely enough, Core Foundation uses UTF-32
1600 // internally quite a bit - it's just not public (yet).
1602 #include <CoreFoundation/CFString.h>
1603 #include <CoreFoundation/CFStringEncodingExt.h>
// Map a wxFontEncoding value to the matching CoreFoundation CFStringEncoding.
//
// wxFONTENCODING_DEFAULT maps to CFStringGetSystemEncoding(); every other
// supported value is translated by the switch below. The local variable enc
// starts out as kCFStringEncodingInvalidId.
// NOTE(review): presumably enc is what gets returned, so encodings without a
// case yield kCFStringEncodingInvalidId -- the break and return statements are
// not visible in this excerpt, confirm against the full file.
//
// Both wxFONTENCODING_ALTERNATIVE and wxFONTENCODING_CP866 map to
// kCFStringEncodingDOSRussian. The cases for BULGARIAN, UTF7 and MACKEYBOARD
// are commented out.
1605 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
1607 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
1608 if ( encoding
== wxFONTENCODING_DEFAULT
)
1610 enc
= CFStringGetSystemEncoding();
1612 else switch( encoding
)
1614 case wxFONTENCODING_ISO8859_1
:
1615 enc
= kCFStringEncodingISOLatin1
;
1617 case wxFONTENCODING_ISO8859_2
:
1618 enc
= kCFStringEncodingISOLatin2
;
1620 case wxFONTENCODING_ISO8859_3
:
1621 enc
= kCFStringEncodingISOLatin3
;
1623 case wxFONTENCODING_ISO8859_4
:
1624 enc
= kCFStringEncodingISOLatin4
;
1626 case wxFONTENCODING_ISO8859_5
:
1627 enc
= kCFStringEncodingISOLatinCyrillic
;
1629 case wxFONTENCODING_ISO8859_6
:
1630 enc
= kCFStringEncodingISOLatinArabic
;
1632 case wxFONTENCODING_ISO8859_7
:
1633 enc
= kCFStringEncodingISOLatinGreek
;
1635 case wxFONTENCODING_ISO8859_8
:
1636 enc
= kCFStringEncodingISOLatinHebrew
;
1638 case wxFONTENCODING_ISO8859_9
:
1639 enc
= kCFStringEncodingISOLatin5
;
1641 case wxFONTENCODING_ISO8859_10
:
1642 enc
= kCFStringEncodingISOLatin6
;
1644 case wxFONTENCODING_ISO8859_11
:
1645 enc
= kCFStringEncodingISOLatinThai
;
1647 case wxFONTENCODING_ISO8859_13
:
1648 enc
= kCFStringEncodingISOLatin7
;
1650 case wxFONTENCODING_ISO8859_14
:
1651 enc
= kCFStringEncodingISOLatin8
;
1653 case wxFONTENCODING_ISO8859_15
:
1654 enc
= kCFStringEncodingISOLatin9
;
1657 case wxFONTENCODING_KOI8
:
1658 enc
= kCFStringEncodingKOI8_R
;
1660 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
1661 enc
= kCFStringEncodingDOSRussian
;
1664 // case wxFONTENCODING_BULGARIAN :
1668 case wxFONTENCODING_CP437
:
1669 enc
=kCFStringEncodingDOSLatinUS
;
1671 case wxFONTENCODING_CP850
:
1672 enc
= kCFStringEncodingDOSLatin1
;
1674 case wxFONTENCODING_CP852
:
1675 enc
= kCFStringEncodingDOSLatin2
;
1677 case wxFONTENCODING_CP855
:
1678 enc
= kCFStringEncodingDOSCyrillic
;
1680 case wxFONTENCODING_CP866
:
1681 enc
=kCFStringEncodingDOSRussian
;
1683 case wxFONTENCODING_CP874
:
1684 enc
= kCFStringEncodingDOSThai
;
1686 case wxFONTENCODING_CP932
:
1687 enc
= kCFStringEncodingDOSJapanese
;
1689 case wxFONTENCODING_CP936
:
1690 enc
=kCFStringEncodingDOSChineseSimplif
;
1692 case wxFONTENCODING_CP949
:
1693 enc
= kCFStringEncodingDOSKorean
;
1695 case wxFONTENCODING_CP950
:
1696 enc
= kCFStringEncodingDOSChineseTrad
;
1698 case wxFONTENCODING_CP1250
:
1699 enc
= kCFStringEncodingWindowsLatin2
;
1701 case wxFONTENCODING_CP1251
:
1702 enc
=kCFStringEncodingWindowsCyrillic
;
1704 case wxFONTENCODING_CP1252
:
1705 enc
=kCFStringEncodingWindowsLatin1
;
1707 case wxFONTENCODING_CP1253
:
1708 enc
= kCFStringEncodingWindowsGreek
;
1710 case wxFONTENCODING_CP1254
:
1711 enc
= kCFStringEncodingWindowsLatin5
;
1713 case wxFONTENCODING_CP1255
:
1714 enc
=kCFStringEncodingWindowsHebrew
;
1716 case wxFONTENCODING_CP1256
:
1717 enc
=kCFStringEncodingWindowsArabic
;
1719 case wxFONTENCODING_CP1257
:
1720 enc
= kCFStringEncodingWindowsBalticRim
;
1722 // This only really encodes to UTF7 (if that) evidently
1723 // case wxFONTENCODING_UTF7 :
1724 // enc = kCFStringEncodingNonLossyASCII ;
1726 case wxFONTENCODING_UTF8
:
1727 enc
= kCFStringEncodingUTF8
;
1729 case wxFONTENCODING_EUC_JP
:
1730 enc
= kCFStringEncodingEUC_JP
;
1732 case wxFONTENCODING_UTF16
:
1733 enc
= kCFStringEncodingUnicode
;
1735 case wxFONTENCODING_MACROMAN
:
1736 enc
= kCFStringEncodingMacRoman
;
1738 case wxFONTENCODING_MACJAPANESE
:
1739 enc
= kCFStringEncodingMacJapanese
;
1741 case wxFONTENCODING_MACCHINESETRAD
:
1742 enc
= kCFStringEncodingMacChineseTrad
;
1744 case wxFONTENCODING_MACKOREAN
:
1745 enc
= kCFStringEncodingMacKorean
;
1747 case wxFONTENCODING_MACARABIC
:
1748 enc
= kCFStringEncodingMacArabic
;
1750 case wxFONTENCODING_MACHEBREW
:
1751 enc
= kCFStringEncodingMacHebrew
;
1753 case wxFONTENCODING_MACGREEK
:
1754 enc
= kCFStringEncodingMacGreek
;
1756 case wxFONTENCODING_MACCYRILLIC
:
1757 enc
= kCFStringEncodingMacCyrillic
;
1759 case wxFONTENCODING_MACDEVANAGARI
:
1760 enc
= kCFStringEncodingMacDevanagari
;
1762 case wxFONTENCODING_MACGURMUKHI
:
1763 enc
= kCFStringEncodingMacGurmukhi
;
1765 case wxFONTENCODING_MACGUJARATI
:
1766 enc
= kCFStringEncodingMacGujarati
;
1768 case wxFONTENCODING_MACORIYA
:
1769 enc
= kCFStringEncodingMacOriya
;
1771 case wxFONTENCODING_MACBENGALI
:
1772 enc
= kCFStringEncodingMacBengali
;
1774 case wxFONTENCODING_MACTAMIL
:
1775 enc
= kCFStringEncodingMacTamil
;
1777 case wxFONTENCODING_MACTELUGU
:
1778 enc
= kCFStringEncodingMacTelugu
;
1780 case wxFONTENCODING_MACKANNADA
:
1781 enc
= kCFStringEncodingMacKannada
;
1783 case wxFONTENCODING_MACMALAJALAM
:
1784 enc
= kCFStringEncodingMacMalayalam
;
1786 case wxFONTENCODING_MACSINHALESE
:
1787 enc
= kCFStringEncodingMacSinhalese
;
1789 case wxFONTENCODING_MACBURMESE
:
1790 enc
= kCFStringEncodingMacBurmese
;
1792 case wxFONTENCODING_MACKHMER
:
1793 enc
= kCFStringEncodingMacKhmer
;
1795 case wxFONTENCODING_MACTHAI
:
1796 enc
= kCFStringEncodingMacThai
;
1798 case wxFONTENCODING_MACLAOTIAN
:
1799 enc
= kCFStringEncodingMacLaotian
;
1801 case wxFONTENCODING_MACGEORGIAN
:
1802 enc
= kCFStringEncodingMacGeorgian
;
1804 case wxFONTENCODING_MACARMENIAN
:
1805 enc
= kCFStringEncodingMacArmenian
;
1807 case wxFONTENCODING_MACCHINESESIMP
:
1808 enc
= kCFStringEncodingMacChineseSimp
;
1810 case wxFONTENCODING_MACTIBETAN
:
1811 enc
= kCFStringEncodingMacTibetan
;
1813 case wxFONTENCODING_MACMONGOLIAN
:
1814 enc
= kCFStringEncodingMacMongolian
;
1816 case wxFONTENCODING_MACETHIOPIC
:
1817 enc
= kCFStringEncodingMacEthiopic
;
1819 case wxFONTENCODING_MACCENTRALEUR
:
1820 enc
= kCFStringEncodingMacCentralEurRoman
;
1822 case wxFONTENCODING_MACVIATNAMESE
:
1823 enc
= kCFStringEncodingMacVietnamese
;
1825 case wxFONTENCODING_MACARABICEXT
:
1826 enc
= kCFStringEncodingMacExtArabic
;
1828 case wxFONTENCODING_MACSYMBOL
:
1829 enc
= kCFStringEncodingMacSymbol
;
1831 case wxFONTENCODING_MACDINGBATS
:
1832 enc
= kCFStringEncodingMacDingbats
;
1834 case wxFONTENCODING_MACTURKISH
:
1835 enc
= kCFStringEncodingMacTurkish
;
1837 case wxFONTENCODING_MACCROATIAN
:
1838 enc
= kCFStringEncodingMacCroatian
;
1840 case wxFONTENCODING_MACICELANDIC
:
1841 enc
= kCFStringEncodingMacIcelandic
;
1843 case wxFONTENCODING_MACROMANIAN
:
1844 enc
= kCFStringEncodingMacRomanian
;
1846 case wxFONTENCODING_MACCELTIC
:
1847 enc
= kCFStringEncodingMacCeltic
;
1849 case wxFONTENCODING_MACGAELIC
:
1850 enc
= kCFStringEncodingMacGaelic
;
1852 // case wxFONTENCODING_MACKEYBOARD :
1853 // enc = kCFStringEncodingMacKeyboardGlyphs ;
1856 // because gcc is picky
// wxMBConv implementation that converts by going through CoreFoundation
// CFString objects. m_encoding holds the CFStringEncoding of the multibyte
// side; Init() sets it from the system encoding, from a charset name or from a
// wxFontEncoding (the latter two are mapped with wxCFStringEncFromFontEnc()).
// NOTE(review): this excerpt is missing lines of the class (braces, access
// specifiers, some statements and preprocessor branches), so the comments
// below only describe what is visible.
1862 class wxMBConv_cocoa
: public wxMBConv
// default construction: use the encoding reported by CoreFoundation
1867 Init(CFStringGetSystemEncoding()) ;
// construct from a charset name; "false" is passed to CharsetToEncoding()
1870 wxMBConv_cocoa(const wxChar
* name
)
1872 Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, false) ) ) ;
// construct from a wxFontEncoding value
1875 wxMBConv_cocoa(wxFontEncoding encoding
)
1877 Init( wxCFStringEncFromFontEnc(encoding
) );
// remember the CoreFoundation encoding used by MB2WC(), WC2MB() and IsOk()
1884 void Init( CFStringEncoding encoding
)
1886 m_encoding
= encoding
;
// Multibyte to wide: build a CFString from szUnConv interpreted in m_encoding
// and extract its UniChar characters; with a 4 byte wchar_t they are then
// widened into szOut by wxMBConvUTF16::MB2WC(). nOutLength is the length of
// the CFString as reported by CFStringGetLength().
1889 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
1893 CFStringRef theString
= CFStringCreateWithBytes (
1894 NULL
, //the allocator
1895 (const UInt8
*)szUnConv
,
1898 false //no BOM/external representation
1901 wxASSERT(theString
);
1903 size_t nOutLength
= CFStringGetLength(theString
);
1907 CFRelease(theString
);
// NOTE(review): the range length is nOutSize rather than the string length;
// confirm that CFStringGetCharacters() is never asked for more characters
// than theString holds.
1911 CFRange theRange
= { 0, nOutSize
};
1913 #if SIZEOF_WCHAR_T == 4
1914 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
1917 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
1919 CFRelease(theString
);
// NOTE(review): szUniCharBuffer is allocated with nOutSize elements above, so
// this write is out of bounds whenever nOutLength >= nOutSize unless a check
// that is not visible in this excerpt prevents it.
1921 szUniCharBuffer
[nOutLength
] = '\0' ;
1923 #if SIZEOF_WCHAR_T == 4
// NOTE(review): wxMBConvUTF16 is used here while WC2MB() below uses
// wxMBConvUTF16BE -- confirm that the asymmetry is intended.
1924 wxMBConvUTF16 converter
;
1925 converter
.MB2WC(szOut
, (const char*)szUniCharBuffer
, nOutSize
) ;
1926 delete[] szUniCharBuffer
;
// Wide to multibyte: wrap the UTF-16 form of szUnConv in a CFString created
// with CFStringCreateWithCharactersNoCopy() and extract its bytes in
// m_encoding. With a 4 byte wchar_t the input is first converted to UTF-16
// with wxMBConvUTF16BE into a temporary buffer which is freed at the end.
// The visible return value is nRealOutSize - 1.
1932 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
1936 size_t nRealOutSize
;
1937 size_t nBufSize
= wxWcslen(szUnConv
);
1938 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
1940 #if SIZEOF_WCHAR_T == 4
1941 wxMBConvUTF16BE converter
;
// first call measures the UTF-16 size in bytes, second call converts
1942 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
1943 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1] ;
1944 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
)) ;
// from here on nBufSize counts UniChars, not bytes
1945 nBufSize
/= sizeof(UniChar
);
1948 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
1952 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
1955 wxASSERT(theString
);
1957 //Note that CER puts a BOM when converting to unicode
1958 //so we check and use getchars instead in that case
1959 if (m_encoding
== kCFStringEncodingUnicode
)
// NOTE(review): the range length nOutSize - 1 is not visibly clamped to the
// string length and wraps around when nOutSize is 0 -- confirm the guards in
// the full source.
1962 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
1964 nRealOutSize
= CFStringGetLength(theString
) + 1;
1970 CFRangeMake(0, CFStringGetLength(theString
)),
1972 0, //what to put in characters that can't be converted -
1973 //0 tells CFString to return NULL if it meets such a character
1974 false, //not an external representation
1977 (CFIndex
*) &nRealOutSize
1981 CFRelease(theString
);
1983 #if SIZEOF_WCHAR_T == 4
1984 delete[] szUniBuffer
;
1987 return nRealOutSize
- 1;
// usable only if the encoding is valid and available in CoreFoundation
1992 return m_encoding
!= kCFStringEncodingInvalidId
&&
1993 CFStringIsEncodingAvailable(m_encoding
);
// CoreFoundation encoding of the multibyte side of the conversion
1997 CFStringEncoding m_encoding
;
2000 #endif // defined(__WXCOCOA__)
2002 // ============================================================================
2003 // Mac conversion classes
2004 // ============================================================================
2006 #if defined(__WXMAC__) && defined(TARGET_CARBON)
// wxMBConv implementation based on the Text Encoding Converter (TEC) API.
// Init() creates two TEC converter objects, m_MB2WC_converter and
// m_WC2MB_converter, between m_char_encoding and a 16 bit Unicode encoding
// (m_unicode_encoding); they are disposed of with TECDisposeConverter().
// NOTE(review): this excerpt is missing lines of the class (braces, some
// declarations, preprocessor branches and return statements), so the comments
// below only describe what is visible.
2008 class wxMBConv_mac
: public wxMBConv
// default construction: use the encoding reported by CoreFoundation
2013 Init(CFStringGetSystemEncoding()) ;
// construct from a charset name; "false" is passed to CharsetToEncoding()
2016 wxMBConv_mac(const wxChar
* name
)
2018 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, false) ) ) ;
// construct from a wxFontEncoding value
2021 wxMBConv_mac(wxFontEncoding encoding
)
2023 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
// release both converters
// NOTE(review): status is assigned but no check of it is visible here.
2028 OSStatus status
= noErr
;
2029 status
= TECDisposeConverter(m_MB2WC_converter
);
2030 status
= TECDisposeConverter(m_WC2MB_converter
);
// Create the two converters for the given multibyte encoding.
// NOTE(review): the results of TECCreateConverter() are stored in status but
// no check of them is visible here; IsOk() tests the converter handles.
2034 void Init( TextEncodingBase encoding
)
2036 OSStatus status
= noErr
;
2037 m_char_encoding
= encoding
;
2038 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
2040 status
= TECCreateConverter(&m_MB2WC_converter
,
2042 m_unicode_encoding
);
2043 status
= TECCreateConverter(&m_WC2MB_converter
,
// Multibyte to wide: run psz through m_MB2WC_converter into a UniChar buffer
// and, with a 4 byte wchar_t, widen that buffer with wxMBConvUTF16BE into buf
// (or into the temporary tbuf when buf is NULL).
2048 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2050 OSStatus status
= noErr
;
2051 ByteCount byteOutLen
;
2052 ByteCount byteInLen
= strlen(psz
) ;
2053 wchar_t *tbuf
= NULL
;
2054 UniChar
* ubuf
= NULL
;
2059 //apple specs say at least 32
2060 n
= wxMax( 32 , byteInLen
) ;
2061 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2063 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2064 #if SIZEOF_WCHAR_T == 4
// two extra bytes leave room for the UniChar terminator written below
2065 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2067 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2069 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2070 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2071 #if SIZEOF_WCHAR_T == 4
2072 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2073 // is not properly terminated we get random characters at the end
// NOTE(review): byteOutLen is used without a visible check of status.
2074 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2075 wxMBConvUTF16BE converter
;
2076 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
2079 res
= byteOutLen
/ sizeof( UniChar
) ;
2084 if ( buf
&& res
< n
)
// Wide to multibyte: with a 4 byte wchar_t, psz is first converted to UTF-16
// with wxMBConvUTF16BE; the UTF-16 data is then run through
// m_WC2MB_converter into buf (or into the temporary tbuf when buf is NULL).
// The result is converted back with MB2WC() and compared with psz in order to
// detect lossy conversions.
2090 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2092 OSStatus status
= noErr
;
2093 ByteCount byteOutLen
;
2094 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2100 //apple specs say at least 32
// the temporary buffer allows 8 bytes per input character
2101 n
= wxMax( 32 , ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2102 tbuf
= (char*) malloc( n
) ;
2105 ByteCount byteBufferLen
= n
;
2106 UniChar
* ubuf
= NULL
;
2107 #if SIZEOF_WCHAR_T == 4
2108 wxMBConvUTF16BE converter
;
// first call measures the UTF-16 size in bytes, second call converts
2109 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2110 byteInLen
= unicharlen
;
2111 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2112 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2114 ubuf
= (UniChar
*) psz
;
2116 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2117 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
2118 #if SIZEOF_WCHAR_T == 4
2124 size_t res
= byteOutLen
;
2125 if ( buf
&& res
< n
)
2129 //we need to double-trip to verify it didn't insert any ? in place
2130 //of bogus characters
2131 wxWCharBuffer
wcBuf(n
);
2132 size_t pszlen
= wxWcslen(psz
);
2133 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
2134 wxWcslen(wcBuf
) != pszlen
||
2135 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2137 // we didn't obtain the same thing we started from, hence
2138 // the conversion was lossy and we consider that it failed
// usable only if both converter handles are non-NULL
2147 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
// TEC converter objects for the two directions
2150 TECObjectRef m_MB2WC_converter
;
2151 TECObjectRef m_WC2MB_converter
;
// encodings the converters were created for
2153 TextEncodingBase m_char_encoding
;
2154 TextEncodingBase m_unicode_encoding
;
2157 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2159 // ============================================================================
2160 // wxEncodingConverter based conversion classes
2161 // ============================================================================
// wxMBConv implementation built on two wxEncodingConverter objects: m2w
// converts from m_enc to wxFONTENCODING_UNICODE and w2m converts back. m_ok
// records whether both converters were initialized successfully and is what
// IsOk() returns.
// NOTE(review): this excerpt is missing lines of the class (braces, the Init()
// header, conditionals and return statements), so the comments below only
// describe what is visible.
2165 class wxMBConv_wxwin
: public wxMBConv
// set up both directions; m_ok is true only if both Init() calls succeed
2170 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2171 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2175 // temporarily just use wxEncodingConverter stuff,
2176 // so that it works while a better implementation is built
2177 wxMBConv_wxwin(const wxChar
* name
)
// "false" is passed as the second argument of CharsetToEncoding()
2180 m_enc
= wxFontMapper::Get()->CharsetToEncoding(name
, false);
2182 m_enc
= wxFONTENCODING_SYSTEM
;
2187 wxMBConv_wxwin(wxFontEncoding enc
)
// Multibyte to wide via m2w; the size parameter is unused (WXUNUSED).
// NOTE(review): no use of the result of Convert() is visible in this excerpt.
2194 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2196 size_t inbuf
= strlen(psz
);
2198 m2w
.Convert(psz
,buf
);
// Wide to multibyte via w2m; the size parameter is unused (WXUNUSED).
// NOTE(review): no use of the result of Convert() is visible in this excerpt.
2202 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2204 const size_t inbuf
= wxWcslen(psz
);
2206 w2m
.Convert(psz
,buf
);
2211 bool IsOk() const { return m_ok
; }
// encoding of the multibyte side and the converters for both directions
2214 wxFontEncoding m_enc
;
2215 wxEncodingConverter m2w
, w2m
;
2217 // were we initialized successfully?
2220 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2223 #endif // wxUSE_FONTMAP
2225 // ============================================================================
2226 // wxCSConv implementation
2227 // ============================================================================
// Initialization helper of wxCSConv.
// NOTE(review): the body is not visible in this excerpt; presumably it resets
// the members shared by all constructors -- confirm against the full file.
2229 void wxCSConv::Init()
// Construct a converter for the charset with the given name. The visible code
// sets m_encoding to wxFONTENCODING_SYSTEM.
// NOTE(review): how charset itself is stored is not visible in this excerpt.
2236 wxCSConv::wxCSConv(const wxChar
*charset
)
2245 m_encoding
= wxFONTENCODING_SYSTEM
;
// Construct a converter for the given font encoding. wxFONTENCODING_MAX and
// wxFONTENCODING_DEFAULT are rejected: a debug failure is raised with
// wxFAIL_MSG() and wxFONTENCODING_SYSTEM is used instead.
2248 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2250 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2252 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
// fall back to the system encoding
2254 encoding
= wxFONTENCODING_SYSTEM
;
2259 m_encoding
= encoding
;
// Destructor.
// NOTE(review): the body is not visible in this excerpt; presumably it
// releases m_name and m_convReal -- confirm against the full file.
2262 wxCSConv::~wxCSConv()
// Copy constructor: takes over the charset name (through SetName(), which
// duplicates the string) and the encoding of conv.
// NOTE(review): no copy of m_convReal is visible in this excerpt.
2267 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2272 SetName(conv
.m_name
);
2273 m_encoding
= conv
.m_encoding
;
// Assignment: takes over the charset name (through SetName(), which
// duplicates the string) and the encoding of conv.
// NOTE(review): the statements before SetName() and the return statement are
// not visible in this excerpt.
2276 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2280 SetName(conv
.m_name
);
2281 m_encoding
= conv
.m_encoding
;
// Reset this converter.
// NOTE(review): the body is not visible in this excerpt; presumably it frees
// m_name and m_convReal -- confirm against the full file.
2286 void wxCSConv::Clear()
// Store a private copy of the charset name: m_name receives a duplicate of
// charset made with wxStrdup().
// NOTE(review): any NULL check on charset and the point where the deferred
// creation flag is set are not visible in this excerpt.
2295 void wxCSConv::SetName(const wxChar
*charset
)
2299 m_name
= wxStrdup(charset
);
// Create the wxMBConv object that does the real work for this wxCSConv, based
// on m_name and m_encoding. The candidates are tried in the order listed in
// the comment below: platform converters (iconv, Win32, Mac, Cocoa), then the
// hard coded UTF converters, then wxMBConv_wxwin. If none can be used, an
// error is logged with wxLogError().
// NOTE(review): many lines are missing from this excerpt (preprocessor
// conditions, IsOk() checks, deletes and return statements), so the exact
// fallback behaviour and the final return value cannot be confirmed from here.
2304 wxMBConv
*wxCSConv::DoCreate() const
2306 // check for the special case of ASCII or ISO8859-1 charset: as we have
2307 // special knowledge of it anyhow, we don't need to create a special
2308 // conversion object
2309 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
2311 // don't convert at all
2315 // we trust OS to do conversion better than we can so try external
2316 // conversion methods first
2318 // the full order is:
2319 // 1. OS conversion (iconv() under Unix or Win32 API)
2320 // 2. hard coded conversions for UTF
2321 // 3. wxEncodingConverter as fall back
2327 #endif // !wxUSE_FONTMAP
2329 wxString
name(m_name
);
2333 name
= wxFontMapper::Get()->GetEncodingName(m_encoding
);
2334 #endif // wxUSE_FONTMAP
2336 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
2342 #endif // HAVE_ICONV
2344 #ifdef wxHAVE_WIN32_MB2WC
// prefer the charset name when there is one, else use the encoding
2347 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2348 : new wxMBConv_win32(m_encoding
);
2357 #endif // wxHAVE_WIN32_MB2WC
2358 #if defined(__WXMAC__)
2360 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
) )
2363 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
2364 : new wxMBConv_mac(m_encoding
);
2372 #if defined(__WXCOCOA__)
2374 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
2377 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
2378 : new wxMBConv_cocoa(m_encoding
);
2387 wxFontEncoding enc
= m_encoding
;
2389 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2391 // use "false" to suppress interactive dialogs -- we can be called from
2392 // anywhere and popping up a dialog from here is the last thing we want to do
2394 enc
= wxFontMapper::Get()->CharsetToEncoding(m_name
, false);
2396 #endif // wxUSE_FONTMAP
// hard coded UTF converters
2400 case wxFONTENCODING_UTF7
:
2401 return new wxMBConvUTF7
;
2403 case wxFONTENCODING_UTF8
:
2404 return new wxMBConvUTF8
;
2406 case wxFONTENCODING_UTF16BE
:
2407 return new wxMBConvUTF16BE
;
2409 case wxFONTENCODING_UTF16LE
:
2410 return new wxMBConvUTF16LE
;
2412 case wxFONTENCODING_UTF32BE
:
2413 return new wxMBConvUTF32BE
;
2415 case wxFONTENCODING_UTF32LE
:
2416 return new wxMBConvUTF32LE
;
2419 // nothing to do but put here to suppress gcc warnings
2426 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2427 : new wxMBConv_wxwin(m_encoding
);
2433 #endif // wxUSE_FONTMAP
2435 // NB: This is a hack to prevent deadlock. What could otherwise happen
2436 // in Unicode build: wxConvLocal creation ends up being here
2437 // because of some failure and logs the error. But wxLog will try to
2438 // attach timestamp, for which it will need wxConvLocal (to convert
2439 // time to char* and then wchar_t*), but that fails, tries to log
2440 // error, but wxLog has a (already locked) critical section that
2441 // guards static buffer.
2442 static bool alreadyLoggingError
= false;
2443 if (!alreadyLoggingError
)
2445 alreadyLoggingError
= true;
// NOTE(review): in the !wxUSE_FONTMAP branch below m_encoding, an enum value,
// is formatted with "%s"; this looks like a format/argument mismatch -- confirm
// and consider an integer conversion specifier instead.
2446 wxLogError(_("Cannot convert from the charset '%s'!"),
2450 wxFontMapper::GetEncodingDescription(m_encoding
).c_str()
2451 #else // !wxUSE_FONTMAP
2452 wxString::Format(_("encoding %s"), m_encoding
).c_str()
2453 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2455 alreadyLoggingError
= false;
// Create the real conversion object on demand. Although the method is const,
// it updates m_name, m_convReal and m_deferred through a non-const pointer to
// this object. After DoCreate() has run, m_deferred is set to false.
// NOTE(review): the condition guarding the creation (presumably a test of
// m_deferred) is not visible in this excerpt.
2461 void wxCSConv::CreateConvIfNeeded() const
2465 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
2468 // if we have neither the name nor the encoding, use the default
2469 // encoding for this system
2470 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
2472 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
2474 #endif // wxUSE_INTL
2476 self
->m_convReal
= DoCreate();
2477 self
->m_deferred
= false;
// Convert the multibyte string psz to wide characters. The real converter is
// created first if necessary and the call is forwarded to it; the code after
// that copies every byte of psz, including the terminating NUL, to buf as an
// unsigned value (a direct Latin-1 style mapping).
// NOTE(review): the conditions selecting between the two paths and the final
// return statement are not visible in this excerpt.
2481 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2483 CreateConvIfNeeded();
2486 return m_convReal
->MB2WC(buf
, psz
, n
);
2489 size_t len
= strlen(psz
);
// NOTE(review): the loop writes len + 1 elements and no check against n is
// visible -- confirm that callers always pass a large enough buffer.
2493 for (size_t c
= 0; c
<= len
; c
++)
2494 buf
[c
] = (unsigned char)(psz
[c
]);
// Convert the wide string psz to multibyte characters. The real converter is
// created first if necessary and the call is forwarded to it; the code after
// that narrows every wide character of psz, including the terminating NUL, to
// char and stores it in buf.
// NOTE(review): the conditions selecting between the paths, the body of the
// second loop and the return statements are not visible in this excerpt.
2500 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2502 CreateConvIfNeeded();
2505 return m_convReal
->WC2MB(buf
, psz
, n
);
2508 const size_t len
= wxWcslen(psz
);
// NOTE(review): the loop writes len + 1 elements and no check against n is
// visible -- confirm that callers always pass a large enough buffer.
2511 for (size_t c
= 0; c
<= len
; c
++)
2515 buf
[c
] = (char)psz
[c
];
2520 for (size_t c
= 0; c
<= len
; c
++)
2530 // ----------------------------------------------------------------------------
2532 // ----------------------------------------------------------------------------
// File-local converter objects and the exported references/pointer bound to
// them. wxConvLibcObj has a platform dependent type (wxMBConv_win32,
// wxMBConv_mac or wxMBConvLibc); wxConvCurrent initially points at it.
// NOTE(review): the first preprocessor condition and the matching #else line
// are not visible in this excerpt.
2535 static wxMBConv_win32 wxConvLibcObj
;
2536 #elif defined(__WXMAC__) && !defined(__MACH__)
2537 static wxMBConv_mac wxConvLibcObj
;
2539 static wxMBConvLibc wxConvLibcObj
;
// converters for the system encoding, ISO8859-1, UTF-7 and UTF-8
2542 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
2543 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
2544 static wxMBConvUTF7 wxConvUTF7Obj
;
2545 static wxMBConvUTF8 wxConvUTF8Obj
;
// exported names bound to the objects above
2548 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
2549 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
2550 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
2551 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
2552 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
2553 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
2555 #else // !wxUSE_WCHAR_T
2557 // stand-ins in absence of wchar_t
// NOTE(review): the rest of this declaration is not visible in this excerpt.
2558 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
2563 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T