1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
13 // ============================================================================
15 // ============================================================================
17 // ----------------------------------------------------------------------------
19 // ----------------------------------------------------------------------------
21 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
22 #pragma implementation "strconv.h"
25 // For compilers that support precompilation, includes "wx.h".
26 #include "wx/wxprec.h"
37 #include "wx/strconv.h"
42 #include "wx/msw/private.h"
46 #include "wx/msw/missing.h"
57 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
58 #define wxHAVE_WIN32_MB2WC
59 #endif // __WIN32__ but !__WXMICROWIN__
61 // ----------------------------------------------------------------------------
63 // ----------------------------------------------------------------------------
73 #include "wx/encconv.h"
74 #include "wx/fontmap.h"
78 #include <ATSUnicode.h>
79 #include <TextCommon.h>
80 #include <TextEncodingConverter.h>
82 #include "wx/mac/private.h" // includes mac headers
84 // ----------------------------------------------------------------------------
86 // ----------------------------------------------------------------------------
88 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
89 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
91 #if SIZEOF_WCHAR_T == 4
92 #define WC_NAME "UCS4"
93 #define WC_BSWAP BSWAP_UCS4
94 #ifdef WORDS_BIGENDIAN
95 #define WC_NAME_BEST "UCS-4BE"
97 #define WC_NAME_BEST "UCS-4LE"
99 #elif SIZEOF_WCHAR_T == 2
100 #define WC_NAME "UTF16"
101 #define WC_BSWAP BSWAP_UTF16
103 #ifdef WORDS_BIGENDIAN
104 #define WC_NAME_BEST "UTF-16BE"
106 #define WC_NAME_BEST "UTF-16LE"
108 #else // sizeof(wchar_t) != 2 nor 4
109 // does this ever happen?
110 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
113 // ============================================================================
115 // ============================================================================
117 // ----------------------------------------------------------------------------
118 // UTF-16 en/decoding to/from UCS-4
119 // ----------------------------------------------------------------------------
122 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
127 *output
= (wxUint16
) input
;
130 else if (input
>=0x110000)
138 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
139 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
145 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
147 if ((*input
<0xd800) || (*input
>0xdfff))
152 else if ((input
[1]<0xdc00) || (input
[1]>=0xdfff))
159 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
165 // ----------------------------------------------------------------------------
167 // ----------------------------------------------------------------------------
169 wxMBConv::~wxMBConv()
171 // nothing to do here
174 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
178 // calculate the length of the buffer needed first
179 size_t nLen
= MB2WC(NULL
, psz
, 0);
180 if ( nLen
!= (size_t)-1 )
182 // now do the actual conversion
183 wxWCharBuffer
buf(nLen
);
184 nLen
= MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NULL
185 if ( nLen
!= (size_t)-1 )
192 wxWCharBuffer
buf((wchar_t *)NULL
);
197 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
201 size_t nLen
= WC2MB(NULL
, pwz
, 0);
202 if ( nLen
!= (size_t)-1 )
204 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
205 nLen
= WC2MB(buf
.data(), pwz
, nLen
+ 4);
206 if ( nLen
!= (size_t)-1 )
213 wxCharBuffer
buf((char *)NULL
);
218 // ----------------------------------------------------------------------------
220 // ----------------------------------------------------------------------------
222 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
224 return wxMB2WC(buf
, psz
, n
);
227 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
229 return wxWC2MB(buf
, psz
, n
);
232 // ----------------------------------------------------------------------------
234 // ----------------------------------------------------------------------------
237 static char utf7_setD
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
238 "abcdefghijklmnopqrstuvwxyz"
239 "0123456789'(),-./:?";
240 static char utf7_setO
[]="!\"#$%&*;<=>@[]^_`{|}";
241 static char utf7_setB
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
242 "abcdefghijklmnopqrstuvwxyz"
246 // TODO: write actual implementations of UTF-7 here
247 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf
),
248 const char * WXUNUSED(psz
),
249 size_t WXUNUSED(n
)) const
254 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf
),
255 const wchar_t * WXUNUSED(psz
),
256 size_t WXUNUSED(n
)) const
261 // ----------------------------------------------------------------------------
263 // ----------------------------------------------------------------------------
265 static wxUint32 utf8_max
[]=
266 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
268 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
272 while (*psz
&& ((!buf
) || (len
< n
)))
274 unsigned char cc
= *psz
++, fc
= cc
;
276 for (cnt
= 0; fc
& 0x80; cnt
++)
290 // invalid UTF-8 sequence
295 unsigned ocnt
= cnt
- 1;
296 wxUint32 res
= cc
& (0x3f >> cnt
);
300 if ((cc
& 0xC0) != 0x80)
302 // invalid UTF-8 sequence
305 res
= (res
<< 6) | (cc
& 0x3f);
307 if (res
<= utf8_max
[ocnt
])
309 // illegal UTF-8 encoding
// cast is ok because wchar_t == wxUint16 if WC_UTF16
314 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
315 if (pa
== (size_t)-1)
324 #endif // WC_UTF16/!WC_UTF16
328 if (buf
&& (len
< n
))
333 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
337 while (*psz
&& ((!buf
) || (len
< n
)))
341 // cast is ok for WC_UTF16
342 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
343 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
345 cc
=(*psz
++) & 0x7fffffff;
348 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
362 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
364 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
369 if (buf
&& (len
<n
)) *buf
= 0;
377 // ----------------------------------------------------------------------------
379 // ----------------------------------------------------------------------------
381 #ifdef WORDS_BIGENDIAN
382 #define wxMBConvUTF16straight wxMBConvUTF16BE
383 #define wxMBConvUTF16swap wxMBConvUTF16LE
385 #define wxMBConvUTF16swap wxMBConvUTF16BE
386 #define wxMBConvUTF16straight wxMBConvUTF16LE
392 // copy 16bit MB to 16bit String
393 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
397 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
400 *buf
++ = *(wxUint16
*)psz
;
403 psz
+= sizeof(wxUint16
);
405 if (buf
&& len
<n
) *buf
=0;
411 // copy 16bit String to 16bit MB
412 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
416 while (*psz
&& (!buf
|| len
< n
))
420 *(wxUint16
*)buf
= *psz
;
421 buf
+= sizeof(wxUint16
);
423 len
+= sizeof(wxUint16
);
426 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
432 // swap 16bit MB to 16bit String
433 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
437 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
441 ((char *)buf
)[0] = psz
[1];
442 ((char *)buf
)[1] = psz
[0];
446 psz
+= sizeof(wxUint16
);
448 if (buf
&& len
<n
) *buf
=0;
// swap 16bit String to 16bit MB
455 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
459 while (*psz
&& (!buf
|| len
< n
))
463 *buf
++ = ((char*)psz
)[1];
464 *buf
++ = ((char*)psz
)[0];
466 len
+= sizeof(wxUint16
);
469 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
478 // copy 16bit MB to 32bit String
479 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
483 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
486 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
487 if (pa
== (size_t)-1)
493 psz
+= pa
* sizeof(wxUint16
);
495 if (buf
&& len
<n
) *buf
=0;
501 // copy 32bit String to 16bit MB
502 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
506 while (*psz
&& (!buf
|| len
< n
))
509 size_t pa
=encode_utf16(*psz
, cc
);
511 if (pa
== (size_t)-1)
516 *(wxUint16
*)buf
= cc
[0];
517 buf
+= sizeof(wxUint16
);
520 *(wxUint16
*)buf
= cc
[1];
521 buf
+= sizeof(wxUint16
);
525 len
+= pa
*sizeof(wxUint16
);
528 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
534 // swap 16bit MB to 32bit String
535 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
539 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
543 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
544 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
546 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
547 if (pa
== (size_t)-1)
554 psz
+= pa
* sizeof(wxUint16
);
556 if (buf
&& len
<n
) *buf
=0;
562 // swap 32bit String to 16bit MB
563 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
567 while (*psz
&& (!buf
|| len
< n
))
570 size_t pa
=encode_utf16(*psz
, cc
);
572 if (pa
== (size_t)-1)
577 *buf
++ = ((char*)cc
)[1];
578 *buf
++ = ((char*)cc
)[0];
581 *buf
++ = ((char*)cc
)[3];
582 *buf
++ = ((char*)cc
)[2];
586 len
+= pa
*sizeof(wxUint16
);
589 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
597 // ----------------------------------------------------------------------------
599 // ----------------------------------------------------------------------------
601 #ifdef WORDS_BIGENDIAN
602 #define wxMBConvUTF32straight wxMBConvUTF32BE
603 #define wxMBConvUTF32swap wxMBConvUTF32LE
605 #define wxMBConvUTF32swap wxMBConvUTF32BE
606 #define wxMBConvUTF32straight wxMBConvUTF32LE
610 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
611 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
616 // copy 32bit MB to 16bit String
617 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
621 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
625 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
626 if (pa
== (size_t)-1)
636 psz
+= sizeof(wxUint32
);
638 if (buf
&& len
<n
) *buf
=0;
644 // copy 16bit String to 32bit MB
645 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
649 while (*psz
&& (!buf
|| len
< n
))
653 // cast is ok for WC_UTF16
654 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
655 if (pa
== (size_t)-1)
660 *(wxUint32
*)buf
= cc
;
661 buf
+= sizeof(wxUint32
);
663 len
+= sizeof(wxUint32
);
667 if (buf
&& len
<=n
-sizeof(wxUint32
))
675 // swap 32bit MB to 16bit String
676 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
680 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
683 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
684 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
689 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
690 if (pa
== (size_t)-1)
700 psz
+= sizeof(wxUint32
);
710 // swap 16bit String to 32bit MB
711 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
715 while (*psz
&& (!buf
|| len
< n
))
719 // cast is ok for WC_UTF16
720 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
721 if (pa
== (size_t)-1)
731 len
+= sizeof(wxUint32
);
735 if (buf
&& len
<=n
-sizeof(wxUint32
))
744 // copy 32bit MB to 32bit String
745 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
749 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
752 *buf
++ = *(wxUint32
*)psz
;
754 psz
+= sizeof(wxUint32
);
764 // copy 32bit String to 32bit MB
765 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
769 while (*psz
&& (!buf
|| len
< n
))
773 *(wxUint32
*)buf
= *psz
;
774 buf
+= sizeof(wxUint32
);
777 len
+= sizeof(wxUint32
);
781 if (buf
&& len
<=n
-sizeof(wxUint32
))
788 // swap 32bit MB to 32bit String
789 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
793 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
797 ((char *)buf
)[0] = psz
[3];
798 ((char *)buf
)[1] = psz
[2];
799 ((char *)buf
)[2] = psz
[1];
800 ((char *)buf
)[3] = psz
[0];
804 psz
+= sizeof(wxUint32
);
814 // swap 32bit String to 32bit MB
815 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
819 while (*psz
&& (!buf
|| len
< n
))
823 *buf
++ = ((char *)psz
)[3];
824 *buf
++ = ((char *)psz
)[2];
825 *buf
++ = ((char *)psz
)[1];
826 *buf
++ = ((char *)psz
)[0];
828 len
+= sizeof(wxUint32
);
832 if (buf
&& len
<=n
-sizeof(wxUint32
))
842 // ============================================================================
843 // The classes doing conversion using the iconv_xxx() functions
844 // ============================================================================
848 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
849 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
850 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
851 // (which means error) and says there are 0 bytes left in the input buffer --
852 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
853 // this alternative test for iconv() failure.
854 // [This bug does not appear in glibc 2.2.]
855 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
856 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
857 (errno != E2BIG || bufLeft != 0))
859 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
862 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
864 // ----------------------------------------------------------------------------
865 // wxMBConv_iconv: encapsulates an iconv character set
866 // ----------------------------------------------------------------------------
868 class wxMBConv_iconv
: public wxMBConv
871 wxMBConv_iconv(const wxChar
*name
);
872 virtual ~wxMBConv_iconv();
874 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
875 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
878 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
881 // the iconv handlers used to translate from multibyte to wide char and in
882 // the other direction
887 // the name (for iconv_open()) of a wide char charset -- if none is
888 // available on this machine, it will remain NULL
889 static const char *ms_wcCharsetName
;
891 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
892 // different endian-ness than the native one
893 static bool ms_wcNeedsSwap
;
896 const char *wxMBConv_iconv::ms_wcCharsetName
= NULL
;
897 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
899 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
901 // Do it the hard way
903 for (size_t i
= 0; i
< wxStrlen(name
)+1; i
++)
904 cname
[i
] = (char) name
[i
];
906 // check for charset that represents wchar_t:
907 if (ms_wcCharsetName
== NULL
)
909 ms_wcNeedsSwap
= false;
911 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
912 ms_wcCharsetName
= WC_NAME_BEST
;
913 m2w
= iconv_open(ms_wcCharsetName
, cname
);
915 if (m2w
== (iconv_t
)-1)
917 // try charset w/o bytesex info (e.g. "UCS4")
918 // and check for bytesex ourselves:
919 ms_wcCharsetName
= WC_NAME
;
920 m2w
= iconv_open(ms_wcCharsetName
, cname
);
922 // last bet, try if it knows WCHAR_T pseudo-charset
923 if (m2w
== (iconv_t
)-1)
925 ms_wcCharsetName
= "WCHAR_T";
926 m2w
= iconv_open(ms_wcCharsetName
, cname
);
929 if (m2w
!= (iconv_t
)-1)
931 char buf
[2], *bufPtr
;
932 wchar_t wbuf
[2], *wbufPtr
;
940 outsz
= SIZEOF_WCHAR_T
* 2;
944 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
945 (char**)&wbufPtr
, &outsz
);
947 if (ICONV_FAILED(res
, insz
))
949 ms_wcCharsetName
= NULL
;
950 wxLogLastError(wxT("iconv"));
951 wxLogError(_("Conversion to charset '%s' doesn't work."), name
);
955 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
960 ms_wcCharsetName
= NULL
;
962 // VS: we must not output an error here, since wxWidgets will safely
963 // fall back to using wxEncodingConverter.
964 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name
);
968 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName
, ms_wcNeedsSwap
);
970 else // we already have ms_wcCharsetName
972 m2w
= iconv_open(ms_wcCharsetName
, cname
);
975 // NB: don't ever pass NULL to iconv_open(), it may crash!
976 if ( ms_wcCharsetName
)
978 w2m
= iconv_open( cname
, ms_wcCharsetName
);
986 wxMBConv_iconv::~wxMBConv_iconv()
988 if ( m2w
!= (iconv_t
)-1 )
990 if ( w2m
!= (iconv_t
)-1 )
994 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
996 size_t inbuf
= strlen(psz
);
997 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
999 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1000 wchar_t *bufPtr
= buf
;
1001 const char *pszPtr
= psz
;
1005 // have destination buffer, convert there
1007 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1008 (char**)&bufPtr
, &outbuf
);
1009 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1013 // convert to native endianness
1014 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
1017 // NB: iconv was given only strlen(psz) characters on input, and so
1018 // it couldn't convert the trailing zero. Let's do it ourselves
1019 // if there's some room left for it in the output buffer.
1025 // no destination buffer... convert using temp buffer
1026 // to calculate destination buffer requirement
1031 outbuf
= 8*SIZEOF_WCHAR_T
;
1034 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1035 (char**)&bufPtr
, &outbuf
);
1037 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1038 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1041 if (ICONV_FAILED(cres
, inbuf
))
1043 //VS: it is ok if iconv fails, hence trace only
1044 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1051 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1053 size_t inbuf
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1057 wchar_t *tmpbuf
= 0;
1061 // need to copy to temp buffer to switch endianness
1062 // this absolutely doesn't rock!
1063 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1064 // could be in read-only memory, or be accessed in some other thread)
1065 tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
1066 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
1067 WC_BSWAP(tmpbuf
, inbuf
)
1073 // have destination buffer, convert there
1074 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1078 // NB: iconv was given only wcslen(psz) characters on input, and so
1079 // it couldn't convert the trailing zero. Let's do it ourselves
1080 // if there's some room left for it in the output buffer.
1086 // no destination buffer... convert using temp buffer
1087 // to calculate destination buffer requirement
1091 buf
= tbuf
; outbuf
= 16;
1093 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1096 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1104 if (ICONV_FAILED(cres
, inbuf
))
1106 //VS: it is ok if iconv fails, hence trace only
1107 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1114 #endif // HAVE_ICONV
1117 // ============================================================================
1118 // Win32 conversion classes
1119 // ============================================================================
1121 #ifdef wxHAVE_WIN32_MB2WC
1125 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1126 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1129 class wxMBConv_win32
: public wxMBConv
1134 m_CodePage
= CP_ACP
;
1138 wxMBConv_win32(const wxChar
* name
)
1140 m_CodePage
= wxCharsetToCodepage(name
);
1143 wxMBConv_win32(wxFontEncoding encoding
)
1145 m_CodePage
= wxEncodingToCodepage(encoding
);
1149 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1151 const size_t len
= ::MultiByteToWideChar
1153 m_CodePage
, // code page
1155 psz
, // input string
1156 -1, // its length (NUL-terminated)
1157 buf
, // output string
1158 buf
? n
: 0 // size of output buffer
1161 // note that it returns count of written chars for buf != NULL and size
1162 // of the needed buffer for buf == NULL so in either case the length of
1163 // the string (which never includes the terminating NUL) is one less
1164 return len
? len
- 1 : (size_t)-1;
1167 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1170 we have a problem here: by default, WideCharToMultiByte() may
1171 replace characters unrepresentable in the target code page with bad
1172 quality approximations such as turning "1/2" symbol (U+00BD) into
1173 "1" for the code pages which don't have it and we, obviously, want
1174 to avoid this at any price
1176 the trouble is that this function does it _silently_, i.e. it won't
1177 even tell us whether it did or not... Win98/2000 and higher provide
1178 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1179 we have to resort to a round trip, i.e. check that converting back
1180 results in the same string -- this is, of course, expensive but
1181 otherwise we simply can't be sure to not garble the data.
1184 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1185 // it doesn't work with CJK encodings (which we test for rather roughly
1186 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1188 BOOL usedDef
wxDUMMY_INITIALIZE(false),
1191 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1193 // it's our lucky day
1194 flags
= WC_NO_BEST_FIT_CHARS
;
1195 pUsedDef
= &usedDef
;
1197 else // old system or unsupported encoding
1203 const size_t len
= ::WideCharToMultiByte
1205 m_CodePage
, // code page
1206 flags
, // either none or no best fit
1207 pwz
, // input string
1208 -1, // it is (wide) NUL-terminated
1209 buf
, // output buffer
1210 buf
? n
: 0, // and its size
1211 NULL
, // default "replacement" char
1212 pUsedDef
// [out] was it used?
1217 // function totally failed
1221 // if we were really converting, check if we succeeded
1226 // check if the conversion failed, i.e. if any replacements
1231 else // we must resort to double tripping...
1233 wxWCharBuffer
wcBuf(n
);
1234 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1235 wcscmp(wcBuf
, pwz
) != 0 )
1237 // we didn't obtain the same thing we started from, hence
1238 // the conversion was lossy and we consider that it failed
1244 // see the comment above for the reason of "len - 1"
1248 bool IsOk() const { return m_CodePage
!= -1; }
1251 static bool CanUseNoBestFit()
1253 static int s_isWin98Or2k
= -1;
1255 if ( s_isWin98Or2k
== -1 )
1258 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
1261 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
1265 s_isWin98Or2k
= verMaj
>= 5;
// unknown, be conservative by default
1273 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1276 return s_isWin98Or2k
== 1;
1282 #endif // wxHAVE_WIN32_MB2WC
1284 // ============================================================================
1285 // Mac conversion classes
1286 // ============================================================================
1288 #if defined(__WXMAC__) && defined(TARGET_CARBON)
1290 class wxMBConv_mac
: public wxMBConv
1295 Init(CFStringGetSystemEncoding()) ;
1298 wxMBConv_mac(const wxChar
* name
)
1300 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, FALSE
) ) ) ;
1303 wxMBConv_mac(wxFontEncoding encoding
)
1305 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
1310 OSStatus status
= noErr
;
1311 status
= TECDisposeConverter(m_MB2WC_converter
);
1312 status
= TECDisposeConverter(m_WC2MB_converter
);
1316 void Init( TextEncodingBase encoding
)
1318 OSStatus status
= noErr
;
1319 m_char_encoding
= encoding
;
1320 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
1322 status
= TECCreateConverter(&m_MB2WC_converter
,
1324 m_unicode_encoding
);
1325 status
= TECCreateConverter(&m_WC2MB_converter
,
1330 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1332 OSStatus status
= noErr
;
1333 ByteCount byteOutLen
;
1334 ByteCount byteInLen
= strlen(psz
) ;
1335 wchar_t *tbuf
= NULL
;
1336 UniChar
* ubuf
= NULL
;
1342 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
1344 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
1345 #if SIZEOF_WCHAR_T == 4
1346 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
1348 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
1350 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
1351 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
1352 #if SIZEOF_WCHAR_T == 4
1353 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
1354 // is not properly terminated we get random characters at the end
1355 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
1356 wxMBConvUTF16BE converter
;
1357 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
1360 res
= byteOutLen
/ sizeof( UniChar
) ;
1365 if ( buf
&& res
< n
)
1371 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1373 OSStatus status
= noErr
;
1374 ByteCount byteOutLen
;
1375 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1383 tbuf
= (char*) malloc( n
) ;
1386 ByteCount byteBufferLen
= n
;
1387 UniChar
* ubuf
= NULL
;
1388 #if SIZEOF_WCHAR_T == 4
1389 wxMBConvUTF16BE converter
;
1390 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
1391 byteInLen
= unicharlen
;
1392 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
1393 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
1395 ubuf
= (UniChar
*) psz
;
1397 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
1398 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
1399 #if SIZEOF_WCHAR_T == 4
1405 size_t res
= byteOutLen
;
1406 if ( buf
&& res
< n
)
1413 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
1416 TECObjectRef m_MB2WC_converter
;
1417 TECObjectRef m_WC2MB_converter
;
1419 TextEncodingBase m_char_encoding
;
1420 TextEncodingBase m_unicode_encoding
;
1423 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1425 // ============================================================================
1426 // wxEncodingConverter based conversion classes
1427 // ============================================================================
1431 class wxMBConv_wxwin
: public wxMBConv
1436 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
1437 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
1441 // temporarily just use wxEncodingConverter stuff,
1442 // so that it works while a better implementation is built
1443 wxMBConv_wxwin(const wxChar
* name
)
1446 m_enc
= wxFontMapper::Get()->CharsetToEncoding(name
, false);
1448 m_enc
= wxFONTENCODING_SYSTEM
;
1453 wxMBConv_wxwin(wxFontEncoding enc
)
1460 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
1462 size_t inbuf
= strlen(psz
);
1464 m2w
.Convert(psz
,buf
);
1468 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
1470 const size_t inbuf
= wxWcslen(psz
);
1472 w2m
.Convert(psz
,buf
);
1477 bool IsOk() const { return m_ok
; }
1480 wxFontEncoding m_enc
;
1481 wxEncodingConverter m2w
, w2m
;
1483 // were we initialized successfully?
1486 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
1489 #endif // wxUSE_FONTMAP
1491 // ============================================================================
1492 // wxCSConv implementation
1493 // ============================================================================
1495 void wxCSConv::Init()
1502 wxCSConv::wxCSConv(const wxChar
*charset
)
1511 m_encoding
= wxFONTENCODING_SYSTEM
;
1514 wxCSConv::wxCSConv(wxFontEncoding encoding
)
1516 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
1518 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
1520 encoding
= wxFONTENCODING_SYSTEM
;
1525 m_encoding
= encoding
;
1528 wxCSConv::~wxCSConv()
1533 wxCSConv::wxCSConv(const wxCSConv
& conv
)
1538 SetName(conv
.m_name
);
1539 m_encoding
= conv
.m_encoding
;
1542 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
1546 SetName(conv
.m_name
);
1547 m_encoding
= conv
.m_encoding
;
1552 void wxCSConv::Clear()
1561 void wxCSConv::SetName(const wxChar
*charset
)
1565 m_name
= wxStrdup(charset
);
1570 wxMBConv
*wxCSConv::DoCreate() const
1572 // check for the special case of ASCII or ISO8859-1 charset: as we have
1573 // special knowledge of it anyhow, we don't need to create a special
1574 // conversion object
1575 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
1577 // don't convert at all
1581 // we trust OS to do conversion better than we can so try external
1582 // conversion methods first
1584 // the full order is:
1585 // 1. OS conversion (iconv() under Unix or Win32 API)
1586 // 2. hard coded conversions for UTF
1587 // 3. wxEncodingConverter as fall back
1593 #endif // !wxUSE_FONTMAP
1595 wxString
name(m_name
);
1599 name
= wxFontMapper::Get()->GetEncodingName(m_encoding
);
1600 #endif // wxUSE_FONTMAP
1602 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
1608 #endif // HAVE_ICONV
1610 #ifdef wxHAVE_WIN32_MB2WC
1613 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
1614 : new wxMBConv_win32(m_encoding
);
1623 #endif // wxHAVE_WIN32_MB2WC
1624 #if defined(__WXMAC__)
1626 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
) )
1629 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
1630 : new wxMBConv_mac(m_encoding
);
1639 wxFontEncoding enc
= m_encoding
;
1641 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
1643 // use "false" to suppress interactive dialogs -- we can be called from
1644 // anywhere and popping up a dialog from here is the last thing we want to
1646 enc
= wxFontMapper::Get()->CharsetToEncoding(m_name
, false);
1648 #endif // wxUSE_FONTMAP
1652 case wxFONTENCODING_UTF7
:
1653 return new wxMBConvUTF7
;
1655 case wxFONTENCODING_UTF8
:
1656 return new wxMBConvUTF8
;
1658 case wxFONTENCODING_UTF16BE
:
1659 return new wxMBConvUTF16BE
;
1661 case wxFONTENCODING_UTF16LE
:
1662 return new wxMBConvUTF16LE
;
1664 case wxFONTENCODING_UTF32BE
:
1665 return new wxMBConvUTF32BE
;
1667 case wxFONTENCODING_UTF32LE
:
1668 return new wxMBConvUTF32LE
;
1671 // nothing to do but put here to suppress gcc warnings
1678 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
1679 : new wxMBConv_wxwin(m_encoding
);
1685 #endif // wxUSE_FONTMAP
1687 // NB: This is a hack to prevent deadlock. What could otherwise happen
1688 // in Unicode build: wxConvLocal creation ends up being here
1689 // because of some failure and logs the error. But wxLog will try to
1690 // attach timestamp, for which it will need wxConvLocal (to convert
1691 // time to char* and then wchar_t*), but that fails, tries to log
1692 // error, but wxLog has a (already locked) critical section that
1693 // guards static buffer.
1694 static bool alreadyLoggingError
= false;
1695 if (!alreadyLoggingError
)
1697 alreadyLoggingError
= true;
1698 wxLogError(_("Cannot convert from the charset '%s'!"),
1702 wxFontMapper::GetEncodingDescription(m_encoding
).c_str()
1703 #else // !wxUSE_FONTMAP
1704 wxString::Format(_("encoding %s"), m_encoding
).c_str()
1705 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1707 alreadyLoggingError
= false;
1713 void wxCSConv::CreateConvIfNeeded() const
1717 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
// if we have neither the name nor the encoding, use the default
// encoding for this system
1722 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
1724 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
1726 #endif // wxUSE_INTL
1728 self
->m_convReal
= DoCreate();
1729 self
->m_deferred
= false;
1733 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1735 CreateConvIfNeeded();
1738 return m_convReal
->MB2WC(buf
, psz
, n
);
1741 size_t len
= strlen(psz
);
1745 for (size_t c
= 0; c
<= len
; c
++)
1746 buf
[c
] = (unsigned char)(psz
[c
]);
1752 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1754 CreateConvIfNeeded();
1757 return m_convReal
->WC2MB(buf
, psz
, n
);
1760 const size_t len
= wxWcslen(psz
);
1763 for (size_t c
= 0; c
<= len
; c
++)
1772 for (size_t c
= 0; c
<= len
; c
++)
1782 // ----------------------------------------------------------------------------
1784 // ----------------------------------------------------------------------------
1787 static wxMBConv_win32 wxConvLibcObj
;
1789 static wxMBConvLibc wxConvLibcObj
;
1792 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
1793 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
1794 static wxMBConvUTF7 wxConvUTF7Obj
;
1795 static wxMBConvUTF8 wxConvUTF8Obj
;
1798 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
1799 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
1800 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
1801 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
1802 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
1803 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
1805 #else // !wxUSE_WCHAR_T
1807 // stand-ins in absence of wchar_t
1808 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
1813 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T