1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
13 // ============================================================================
15 // ============================================================================
17 // ----------------------------------------------------------------------------
19 // ----------------------------------------------------------------------------
21 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
22 #pragma implementation "strconv.h"
25 // For compilers that support precompilation, includes "wx.h".
26 #include "wx/wxprec.h"
37 #include "wx/strconv.h"
42 #include "wx/msw/private.h"
46 #include "wx/msw/missing.h"
57 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
58 #define wxHAVE_WIN32_MB2WC
59 #endif // __WIN32__ but !__WXMICROWIN__
61 // ----------------------------------------------------------------------------
63 // ----------------------------------------------------------------------------
73 #include "wx/encconv.h"
74 #include "wx/fontmap.h"
78 #include <ATSUnicode.h>
79 #include <TextCommon.h>
80 #include <TextEncodingConverter.h>
82 #include "wx/mac/private.h" // includes mac headers
84 // ----------------------------------------------------------------------------
86 // ----------------------------------------------------------------------------
// Byte-swap the first len elements of the array str in place.
//
//  str: array of 32-bit (BSWAP_UCS4) or 16-bit (BSWAP_UTF16) units
//  len: number of elements (not bytes) to swap
//
// Both arguments are parenthesized so that expressions such as "p + 1" expand
// correctly, and the counter is a size_t because callers pass size_t lengths.
//
// NB: these expand to a braced block, not an expression: existing callers
//     invoke them without a trailing semicolon, so keep it that way.
#define BSWAP_UCS4(str, len) { size_t _c; for (_c=0; _c<(size_t)(len); _c++) (str)[_c]=wxUINT32_SWAP_ALWAYS((str)[_c]); }
#define BSWAP_UTF16(str, len) { size_t _c; for (_c=0; _c<(size_t)(len); _c++) (str)[_c]=wxUINT16_SWAP_ALWAYS((str)[_c]); }
91 #if SIZEOF_WCHAR_T == 4
92 #define WC_NAME "UCS4"
93 #define WC_BSWAP BSWAP_UCS4
94 #ifdef WORDS_BIGENDIAN
95 #define WC_NAME_BEST "UCS-4BE"
97 #define WC_NAME_BEST "UCS-4LE"
99 #elif SIZEOF_WCHAR_T == 2
100 #define WC_NAME "UTF16"
101 #define WC_BSWAP BSWAP_UTF16
103 #ifdef WORDS_BIGENDIAN
104 #define WC_NAME_BEST "UTF-16BE"
106 #define WC_NAME_BEST "UTF-16LE"
108 #else // sizeof(wchar_t) != 2 nor 4
109 // does this ever happen?
110 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
113 // ============================================================================
115 // ============================================================================
117 // ----------------------------------------------------------------------------
118 // UTF-16 en/decoding to/from UCS-4
119 // ----------------------------------------------------------------------------
122 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
127 *output
= (wxUint16
) input
;
130 else if (input
>=0x110000)
138 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
139 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
145 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
147 if ((*input
<0xd800) || (*input
>0xdfff))
152 else if ((input
[1]<0xdc00) || (input
[1]>=0xdfff))
159 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
165 // ----------------------------------------------------------------------------
167 // ----------------------------------------------------------------------------
169 wxMBConv::~wxMBConv()
171 // nothing to do here
174 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
178 // calculate the length of the buffer needed first
179 size_t nLen
= MB2WC(NULL
, psz
, 0);
180 if ( nLen
!= (size_t)-1 )
182 // now do the actual conversion
183 wxWCharBuffer
buf(nLen
);
184 nLen
= MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NULL
185 if ( nLen
!= (size_t)-1 )
192 wxWCharBuffer
buf((wchar_t *)NULL
);
197 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
201 size_t nLen
= WC2MB(NULL
, pwz
, 0);
202 if ( nLen
!= (size_t)-1 )
204 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
205 nLen
= WC2MB(buf
.data(), pwz
, nLen
+ 4);
206 if ( nLen
!= (size_t)-1 )
213 wxCharBuffer
buf((char *)NULL
);
218 // ----------------------------------------------------------------------------
220 // ----------------------------------------------------------------------------
222 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
224 return wxMB2WC(buf
, psz
, n
);
227 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
229 return wxWC2MB(buf
, psz
, n
);
232 // ----------------------------------------------------------------------------
234 // ----------------------------------------------------------------------------
237 static char utf7_setD
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
238 "abcdefghijklmnopqrstuvwxyz"
239 "0123456789'(),-./:?";
240 static char utf7_setO
[]="!\"#$%&*;<=>@[]^_`{|}";
241 static char utf7_setB
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
242 "abcdefghijklmnopqrstuvwxyz"
246 // TODO: write actual implementations of UTF-7 here
247 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf
),
248 const char * WXUNUSED(psz
),
249 size_t WXUNUSED(n
)) const
254 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf
),
255 const wchar_t * WXUNUSED(psz
),
256 size_t WXUNUSED(n
)) const
261 // ----------------------------------------------------------------------------
263 // ----------------------------------------------------------------------------
265 static wxUint32 utf8_max
[]=
266 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
268 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
272 while (*psz
&& ((!buf
) || (len
< n
)))
274 unsigned char cc
= *psz
++, fc
= cc
;
276 for (cnt
= 0; fc
& 0x80; cnt
++)
290 // invalid UTF-8 sequence
295 unsigned ocnt
= cnt
- 1;
296 wxUint32 res
= cc
& (0x3f >> cnt
);
300 if ((cc
& 0xC0) != 0x80)
302 // invalid UTF-8 sequence
305 res
= (res
<< 6) | (cc
& 0x3f);
307 if (res
<= utf8_max
[ocnt
])
309 // illegal UTF-8 encoding
313 // cast is ok because wchar_t == wxUint16 if WC_UTF16
314 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
315 if (pa
== (size_t)-1)
324 #endif // WC_UTF16/!WC_UTF16
328 if (buf
&& (len
< n
))
333 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
337 while (*psz
&& ((!buf
) || (len
< n
)))
341 // cast is ok for WC_UTF16
342 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
343 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
345 cc
=(*psz
++) & 0x7fffffff;
348 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
362 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
364 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
369 if (buf
&& (len
<n
)) *buf
= 0;
377 // ----------------------------------------------------------------------------
379 // ----------------------------------------------------------------------------
381 #ifdef WORDS_BIGENDIAN
382 #define wxMBConvUTF16straight wxMBConvUTF16BE
383 #define wxMBConvUTF16swap wxMBConvUTF16LE
385 #define wxMBConvUTF16swap wxMBConvUTF16BE
386 #define wxMBConvUTF16straight wxMBConvUTF16LE
392 // copy 16bit MB to 16bit String
393 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
397 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
400 *buf
++ = *(wxUint16
*)psz
;
403 psz
+= sizeof(wxUint16
);
405 if (buf
&& len
<n
) *buf
=0;
411 // copy 16bit String to 16bit MB
412 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
416 while (*psz
&& (!buf
|| len
< n
))
420 *(wxUint16
*)buf
= *psz
;
421 buf
+= sizeof(wxUint16
);
423 len
+= sizeof(wxUint16
);
426 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
432 // swap 16bit MB to 16bit String
433 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
437 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
441 ((char *)buf
)[0] = psz
[1];
442 ((char *)buf
)[1] = psz
[0];
446 psz
+= sizeof(wxUint16
);
448 if (buf
&& len
<n
) *buf
=0;
454 // swap 16bit MB to 16bit String
455 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
459 while (*psz
&& (!buf
|| len
< n
))
463 *buf
++ = ((char*)psz
)[1];
464 *buf
++ = ((char*)psz
)[0];
466 len
+= sizeof(wxUint16
);
469 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
478 // copy 16bit MB to 32bit String
479 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
483 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
486 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
487 if (pa
== (size_t)-1)
493 psz
+= pa
* sizeof(wxUint16
);
495 if (buf
&& len
<n
) *buf
=0;
501 // copy 32bit String to 16bit MB
502 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
506 while (*psz
&& (!buf
|| len
< n
))
509 size_t pa
=encode_utf16(*psz
, cc
);
511 if (pa
== (size_t)-1)
516 *(wxUint16
*)buf
= cc
[0];
517 buf
+= sizeof(wxUint16
);
520 *(wxUint16
*)buf
= cc
[1];
521 buf
+= sizeof(wxUint16
);
525 len
+= pa
*sizeof(wxUint16
);
528 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
534 // swap 16bit MB to 32bit String
535 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
539 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
543 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
544 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
546 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
547 if (pa
== (size_t)-1)
554 psz
+= pa
* sizeof(wxUint16
);
556 if (buf
&& len
<n
) *buf
=0;
562 // swap 32bit String to 16bit MB
563 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
567 while (*psz
&& (!buf
|| len
< n
))
570 size_t pa
=encode_utf16(*psz
, cc
);
572 if (pa
== (size_t)-1)
577 *buf
++ = ((char*)cc
)[1];
578 *buf
++ = ((char*)cc
)[0];
581 *buf
++ = ((char*)cc
)[3];
582 *buf
++ = ((char*)cc
)[2];
586 len
+= pa
*sizeof(wxUint16
);
589 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
597 // ----------------------------------------------------------------------------
599 // ----------------------------------------------------------------------------
601 #ifdef WORDS_BIGENDIAN
602 #define wxMBConvUTF32straight wxMBConvUTF32BE
603 #define wxMBConvUTF32swap wxMBConvUTF32LE
605 #define wxMBConvUTF32swap wxMBConvUTF32BE
606 #define wxMBConvUTF32straight wxMBConvUTF32LE
610 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
611 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
616 // copy 32bit MB to 16bit String
617 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
621 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
625 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
626 if (pa
== (size_t)-1)
636 psz
+= sizeof(wxUint32
);
638 if (buf
&& len
<n
) *buf
=0;
644 // copy 16bit String to 32bit MB
645 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
649 while (*psz
&& (!buf
|| len
< n
))
653 // cast is ok for WC_UTF16
654 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
655 if (pa
== (size_t)-1)
660 *(wxUint32
*)buf
= cc
;
661 buf
+= sizeof(wxUint32
);
663 len
+= sizeof(wxUint32
);
667 if (buf
&& len
<=n
-sizeof(wxUint32
))
675 // swap 32bit MB to 16bit String
676 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
680 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
683 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
684 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
689 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
690 if (pa
== (size_t)-1)
700 psz
+= sizeof(wxUint32
);
710 // swap 16bit String to 32bit MB
711 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
715 while (*psz
&& (!buf
|| len
< n
))
719 // cast is ok for WC_UTF16
720 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
721 if (pa
== (size_t)-1)
731 len
+= sizeof(wxUint32
);
735 if (buf
&& len
<=n
-sizeof(wxUint32
))
744 // copy 32bit MB to 32bit String
745 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
749 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
752 *buf
++ = *(wxUint32
*)psz
;
754 psz
+= sizeof(wxUint32
);
764 // copy 32bit String to 32bit MB
765 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
769 while (*psz
&& (!buf
|| len
< n
))
773 *(wxUint32
*)buf
= *psz
;
774 buf
+= sizeof(wxUint32
);
777 len
+= sizeof(wxUint32
);
781 if (buf
&& len
<=n
-sizeof(wxUint32
))
788 // swap 32bit MB to 32bit String
789 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
793 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
797 ((char *)buf
)[0] = psz
[3];
798 ((char *)buf
)[1] = psz
[2];
799 ((char *)buf
)[2] = psz
[1];
800 ((char *)buf
)[3] = psz
[0];
804 psz
+= sizeof(wxUint32
);
814 // swap 32bit String to 32bit MB
815 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
819 while (*psz
&& (!buf
|| len
< n
))
823 *buf
++ = ((char *)psz
)[3];
824 *buf
++ = ((char *)psz
)[2];
825 *buf
++ = ((char *)psz
)[1];
826 *buf
++ = ((char *)psz
)[0];
828 len
+= sizeof(wxUint32
);
832 if (buf
&& len
<=n
-sizeof(wxUint32
))
842 // ============================================================================
843 // The classes doing conversion using the iconv_xxx() functions
844 // ============================================================================
848 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
849 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
850 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
851 // (which means error) and says there are 0 bytes left in the input buffer --
852 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
853 // this alternative test for iconv() failure.
854 // [This bug does not appear in glibc 2.2.]
855 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
856 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
857 (errno != E2BIG || bufLeft != 0))
859 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
862 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
864 // ----------------------------------------------------------------------------
865 // wxMBConv_iconv: encapsulates an iconv character set
866 // ----------------------------------------------------------------------------
868 class wxMBConv_iconv
: public wxMBConv
871 wxMBConv_iconv(const wxChar
*name
);
872 virtual ~wxMBConv_iconv();
874 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
875 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
878 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
881 // the iconv handlers used to translate from multibyte to wide char and in
882 // the other direction
887 // the name (for iconv_open()) of a wide char charset -- if none is
888 // available on this machine, it will remain NULL
889 static const char *ms_wcCharsetName
;
891 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
892 // different endian-ness than the native one
893 static bool ms_wcNeedsSwap
;
896 const char *wxMBConv_iconv::ms_wcCharsetName
= NULL
;
897 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
899 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
901 // Do it the hard way
903 for (size_t i
= 0; i
< wxStrlen(name
)+1; i
++)
904 cname
[i
] = (char) name
[i
];
906 // check for charset that represents wchar_t:
907 if (ms_wcCharsetName
== NULL
)
909 ms_wcNeedsSwap
= false;
911 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
912 ms_wcCharsetName
= WC_NAME_BEST
;
913 m2w
= iconv_open(ms_wcCharsetName
, cname
);
915 if (m2w
== (iconv_t
)-1)
917 // try charset w/o bytesex info (e.g. "UCS4")
918 // and check for bytesex ourselves:
919 ms_wcCharsetName
= WC_NAME
;
920 m2w
= iconv_open(ms_wcCharsetName
, cname
);
922 // last bet, try if it knows WCHAR_T pseudo-charset
923 if (m2w
== (iconv_t
)-1)
925 ms_wcCharsetName
= "WCHAR_T";
926 m2w
= iconv_open(ms_wcCharsetName
, cname
);
929 if (m2w
!= (iconv_t
)-1)
931 char buf
[2], *bufPtr
;
932 wchar_t wbuf
[2], *wbufPtr
;
940 outsz
= SIZEOF_WCHAR_T
* 2;
944 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
945 (char**)&wbufPtr
, &outsz
);
947 if (ICONV_FAILED(res
, insz
))
949 ms_wcCharsetName
= NULL
;
950 wxLogLastError(wxT("iconv"));
951 wxLogError(_("Conversion to charset '%s' doesn't work."), name
);
955 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
960 ms_wcCharsetName
= NULL
;
962 // VS: we must not output an error here, since wxWidgets will safely
963 // fall back to using wxEncodingConverter.
964 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name
);
968 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName
, ms_wcNeedsSwap
);
970 else // we already have ms_wcCharsetName
972 m2w
= iconv_open(ms_wcCharsetName
, cname
);
975 // NB: don't ever pass NULL to iconv_open(), it may crash!
976 if ( ms_wcCharsetName
)
978 w2m
= iconv_open( cname
, ms_wcCharsetName
);
986 wxMBConv_iconv::~wxMBConv_iconv()
988 if ( m2w
!= (iconv_t
)-1 )
990 if ( w2m
!= (iconv_t
)-1 )
994 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
996 size_t inbuf
= strlen(psz
);
997 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
999 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1000 wchar_t *bufPtr
= buf
;
1001 const char *pszPtr
= psz
;
1005 // have destination buffer, convert there
1007 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1008 (char**)&bufPtr
, &outbuf
);
1009 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1013 // convert to native endianness
1014 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
1017 // NB: iconv was given only strlen(psz) characters on input, and so
1018 // it couldn't convert the trailing zero. Let's do it ourselves
1019 // if there's some room left for it in the output buffer.
1025 // no destination buffer... convert using temp buffer
1026 // to calculate destination buffer requirement
1031 outbuf
= 8*SIZEOF_WCHAR_T
;
1034 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1035 (char**)&bufPtr
, &outbuf
);
1037 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1038 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1041 if (ICONV_FAILED(cres
, inbuf
))
1043 //VS: it is ok if iconv fails, hence trace only
1044 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1051 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1053 size_t inbuf
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1057 wchar_t *tmpbuf
= 0;
1061 // need to copy to temp buffer to switch endianness
1062 // this absolutely doesn't rock!
1063 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1064 // could be in read-only memory, or be accessed in some other thread)
1065 tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
1066 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
1067 WC_BSWAP(tmpbuf
, inbuf
)
1073 // have destination buffer, convert there
1074 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1078 // NB: iconv was given only wcslen(psz) characters on input, and so
1079 // it couldn't convert the trailing zero. Let's do it ourselves
1080 // if there's some room left for it in the output buffer.
1086 // no destination buffer... convert using temp buffer
1087 // to calculate destination buffer requirement
1091 buf
= tbuf
; outbuf
= 16;
1093 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1096 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1104 if (ICONV_FAILED(cres
, inbuf
))
1106 //VS: it is ok if iconv fails, hence trace only
1107 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1114 #endif // HAVE_ICONV
1117 // ============================================================================
1118 // Win32 conversion classes
1119 // ============================================================================
1121 #ifdef wxHAVE_WIN32_MB2WC
1125 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1126 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1129 class wxMBConv_win32
: public wxMBConv
1134 m_CodePage
= CP_ACP
;
1138 wxMBConv_win32(const wxChar
* name
)
1140 m_CodePage
= wxCharsetToCodepage(name
);
1143 wxMBConv_win32(wxFontEncoding encoding
)
1145 m_CodePage
= wxEncodingToCodepage(encoding
);
1149 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1151 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
1152 // the behaviour is not compatible with the Unix version (using iconv)
1153 // and break the library itself, e.g. wxTextInputStream::NextChar()
1154 // wouldn't work if reading an incomplete MB char didn't result in an
1156 const size_t len
= ::MultiByteToWideChar
1158 m_CodePage
, // code page
1159 MB_ERR_INVALID_CHARS
, // flags: fall on error
1160 psz
, // input string
1161 -1, // its length (NUL-terminated)
1162 buf
, // output string
1163 buf
? n
: 0 // size of output buffer
1166 // note that it returns count of written chars for buf != NULL and size
1167 // of the needed buffer for buf == NULL so in either case the length of
1168 // the string (which never includes the terminating NUL) is one less
1169 return len
? len
- 1 : (size_t)-1;
1172 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1175 we have a problem here: by default, WideCharToMultiByte() may
1176 replace characters unrepresentable in the target code page with bad
1177 quality approximations such as turning "1/2" symbol (U+00BD) into
1178 "1" for the code pages which don't have it and we, obviously, want
1179 to avoid this at any price
1181 the trouble is that this function does it _silently_, i.e. it won't
1182 even tell us whether it did or not... Win98/2000 and higher provide
1183 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1184 we have to resort to a round trip, i.e. check that converting back
1185 results in the same string -- this is, of course, expensive but
1186 otherwise we simply can't be sure to not garble the data.
1189 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1190 // it doesn't work with CJK encodings (which we test for rather roughly
1191 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1193 BOOL usedDef
wxDUMMY_INITIALIZE(false),
1196 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1198 // it's our lucky day
1199 flags
= WC_NO_BEST_FIT_CHARS
;
1200 pUsedDef
= &usedDef
;
1202 else // old system or unsupported encoding
1208 const size_t len
= ::WideCharToMultiByte
1210 m_CodePage
, // code page
1211 flags
, // either none or no best fit
1212 pwz
, // input string
1213 -1, // it is (wide) NUL-terminated
1214 buf
, // output buffer
1215 buf
? n
: 0, // and its size
1216 NULL
, // default "replacement" char
1217 pUsedDef
// [out] was it used?
1222 // function totally failed
1226 // if we were really converting, check if we succeeded
1231 // check if the conversion failed, i.e. if any replacements
1236 else // we must resort to double tripping...
1238 wxWCharBuffer
wcBuf(n
);
1239 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1240 wcscmp(wcBuf
, pwz
) != 0 )
1242 // we didn't obtain the same thing we started from, hence
1243 // the conversion was lossy and we consider that it failed
1249 // see the comment above for the reason of "len - 1"
1253 bool IsOk() const { return m_CodePage
!= -1; }
1256 static bool CanUseNoBestFit()
1258 static int s_isWin98Or2k
= -1;
1260 if ( s_isWin98Or2k
== -1 )
1263 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
1266 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
1270 s_isWin98Or2k
= verMaj
>= 5;
1274 // unknown, be conservative by default
1278 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1281 return s_isWin98Or2k
== 1;
1287 #endif // wxHAVE_WIN32_MB2WC
1289 // ============================================================================
1290 // Mac conversion classes
1291 // ============================================================================
1293 #if defined(__WXMAC__) && defined(TARGET_CARBON)
1295 class wxMBConv_mac
: public wxMBConv
1300 Init(CFStringGetSystemEncoding()) ;
1303 wxMBConv_mac(const wxChar
* name
)
1305 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, false) ) ) ;
1308 wxMBConv_mac(wxFontEncoding encoding
)
1310 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
1315 OSStatus status
= noErr
;
1316 status
= TECDisposeConverter(m_MB2WC_converter
);
1317 status
= TECDisposeConverter(m_WC2MB_converter
);
1321 void Init( TextEncodingBase encoding
)
1323 OSStatus status
= noErr
;
1324 m_char_encoding
= encoding
;
1325 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
1327 status
= TECCreateConverter(&m_MB2WC_converter
,
1329 m_unicode_encoding
);
1330 status
= TECCreateConverter(&m_WC2MB_converter
,
1335 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1337 OSStatus status
= noErr
;
1338 ByteCount byteOutLen
;
1339 ByteCount byteInLen
= strlen(psz
) ;
1340 wchar_t *tbuf
= NULL
;
1341 UniChar
* ubuf
= NULL
;
1347 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
1349 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
1350 #if SIZEOF_WCHAR_T == 4
1351 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
1353 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
1355 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
1356 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
1357 #if SIZEOF_WCHAR_T == 4
1358 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
1359 // is not properly terminated we get random characters at the end
1360 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
1361 wxMBConvUTF16BE converter
;
1362 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
1365 res
= byteOutLen
/ sizeof( UniChar
) ;
1370 if ( buf
&& res
< n
)
1376 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1378 OSStatus status
= noErr
;
1379 ByteCount byteOutLen
;
1380 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1388 tbuf
= (char*) malloc( n
) ;
1391 ByteCount byteBufferLen
= n
;
1392 UniChar
* ubuf
= NULL
;
1393 #if SIZEOF_WCHAR_T == 4
1394 wxMBConvUTF16BE converter
;
1395 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
1396 byteInLen
= unicharlen
;
1397 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
1398 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
1400 ubuf
= (UniChar
*) psz
;
1402 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
1403 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
1404 #if SIZEOF_WCHAR_T == 4
1410 size_t res
= byteOutLen
;
1411 if ( buf
&& res
< n
)
1418 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
1421 TECObjectRef m_MB2WC_converter
;
1422 TECObjectRef m_WC2MB_converter
;
1424 TextEncodingBase m_char_encoding
;
1425 TextEncodingBase m_unicode_encoding
;
1428 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1430 // ============================================================================
1431 // wxEncodingConverter based conversion classes
1432 // ============================================================================
1436 class wxMBConv_wxwin
: public wxMBConv
1441 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
1442 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
1446 // temporarily just use wxEncodingConverter stuff,
1447 // so that it works while a better implementation is built
1448 wxMBConv_wxwin(const wxChar
* name
)
1451 m_enc
= wxFontMapper::Get()->CharsetToEncoding(name
, false);
1453 m_enc
= wxFONTENCODING_SYSTEM
;
1458 wxMBConv_wxwin(wxFontEncoding enc
)
1465 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
1467 size_t inbuf
= strlen(psz
);
1469 m2w
.Convert(psz
,buf
);
1473 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
1475 const size_t inbuf
= wxWcslen(psz
);
1477 w2m
.Convert(psz
,buf
);
1482 bool IsOk() const { return m_ok
; }
1485 wxFontEncoding m_enc
;
1486 wxEncodingConverter m2w
, w2m
;
1488 // were we initialized successfully?
1491 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
1494 #endif // wxUSE_FONTMAP
1496 // ============================================================================
1497 // wxCSConv implementation
1498 // ============================================================================
1500 void wxCSConv::Init()
1507 wxCSConv::wxCSConv(const wxChar
*charset
)
1516 m_encoding
= wxFONTENCODING_SYSTEM
;
1519 wxCSConv::wxCSConv(wxFontEncoding encoding
)
1521 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
1523 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
1525 encoding
= wxFONTENCODING_SYSTEM
;
1530 m_encoding
= encoding
;
1533 wxCSConv::~wxCSConv()
1538 wxCSConv::wxCSConv(const wxCSConv
& conv
)
1543 SetName(conv
.m_name
);
1544 m_encoding
= conv
.m_encoding
;
1547 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
1551 SetName(conv
.m_name
);
1552 m_encoding
= conv
.m_encoding
;
1557 void wxCSConv::Clear()
1566 void wxCSConv::SetName(const wxChar
*charset
)
1570 m_name
= wxStrdup(charset
);
1575 wxMBConv
*wxCSConv::DoCreate() const
1577 // check for the special case of ASCII or ISO8859-1 charset: as we have
1578 // special knowledge of it anyhow, we don't need to create a special
1579 // conversion object
1580 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
1582 // don't convert at all
1586 // we trust OS to do conversion better than we can so try external
1587 // conversion methods first
1589 // the full order is:
1590 // 1. OS conversion (iconv() under Unix or Win32 API)
1591 // 2. hard coded conversions for UTF
1592 // 3. wxEncodingConverter as fall back
1598 #endif // !wxUSE_FONTMAP
1600 wxString
name(m_name
);
1604 name
= wxFontMapper::Get()->GetEncodingName(m_encoding
);
1605 #endif // wxUSE_FONTMAP
1607 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
1613 #endif // HAVE_ICONV
1615 #ifdef wxHAVE_WIN32_MB2WC
1618 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
1619 : new wxMBConv_win32(m_encoding
);
1628 #endif // wxHAVE_WIN32_MB2WC
1629 #if defined(__WXMAC__)
1631 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
) )
1634 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
1635 : new wxMBConv_mac(m_encoding
);
1644 wxFontEncoding enc
= m_encoding
;
1646 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
1648 // use "false" to suppress interactive dialogs -- we can be called from
1649 // anywhere and popping up a dialog from here is the last thing we want to
1651 enc
= wxFontMapper::Get()->CharsetToEncoding(m_name
, false);
1653 #endif // wxUSE_FONTMAP
1657 case wxFONTENCODING_UTF7
:
1658 return new wxMBConvUTF7
;
1660 case wxFONTENCODING_UTF8
:
1661 return new wxMBConvUTF8
;
1663 case wxFONTENCODING_UTF16BE
:
1664 return new wxMBConvUTF16BE
;
1666 case wxFONTENCODING_UTF16LE
:
1667 return new wxMBConvUTF16LE
;
1669 case wxFONTENCODING_UTF32BE
:
1670 return new wxMBConvUTF32BE
;
1672 case wxFONTENCODING_UTF32LE
:
1673 return new wxMBConvUTF32LE
;
1676 // nothing to do but put here to suppress gcc warnings
1683 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
1684 : new wxMBConv_wxwin(m_encoding
);
1690 #endif // wxUSE_FONTMAP
1692 // NB: This is a hack to prevent deadlock. What could otherwise happen
1693 // in Unicode build: wxConvLocal creation ends up being here
1694 // because of some failure and logs the error. But wxLog will try to
1695 // attach timestamp, for which it will need wxConvLocal (to convert
1696 // time to char* and then wchar_t*), but that fails, tries to log
1697 // error, but wxLog has a (already locked) critical section that
1698 // guards static buffer.
1699 static bool alreadyLoggingError
= false;
1700 if (!alreadyLoggingError
)
1702 alreadyLoggingError
= true;
1703 wxLogError(_("Cannot convert from the charset '%s'!"),
1707 wxFontMapper::GetEncodingDescription(m_encoding
).c_str()
1708 #else // !wxUSE_FONTMAP
1709 wxString::Format(_("encoding %s"), m_encoding
).c_str()
1710 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1712 alreadyLoggingError
= false;
1718 void wxCSConv::CreateConvIfNeeded() const
1722 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
1725 // if we have neither the name nor the encoding, use the default
1726 // encoding for this system
1727 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
1729 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
1731 #endif // wxUSE_INTL
1733 self
->m_convReal
= DoCreate();
1734 self
->m_deferred
= false;
1738 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1740 CreateConvIfNeeded();
1743 return m_convReal
->MB2WC(buf
, psz
, n
);
1746 size_t len
= strlen(psz
);
1750 for (size_t c
= 0; c
<= len
; c
++)
1751 buf
[c
] = (unsigned char)(psz
[c
]);
1757 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1759 CreateConvIfNeeded();
1762 return m_convReal
->WC2MB(buf
, psz
, n
);
1765 const size_t len
= wxWcslen(psz
);
1768 for (size_t c
= 0; c
<= len
; c
++)
1777 for (size_t c
= 0; c
<= len
; c
++)
1787 // ----------------------------------------------------------------------------
1789 // ----------------------------------------------------------------------------
1792 static wxMBConv_win32 wxConvLibcObj
;
1793 #elif defined(__WXMAC__) && !defined(__MACH__)
1794 static wxMBConv_mac wxConvLibcObj
;
1796 static wxMBConvLibc wxConvLibcObj
;
1799 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
1800 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
1801 static wxMBConvUTF7 wxConvUTF7Obj
;
1802 static wxMBConvUTF8 wxConvUTF8Obj
;
1805 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
1806 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
1807 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
1808 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
1809 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
1810 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
1812 #else // !wxUSE_WCHAR_T
1814 // stand-ins in absence of wchar_t
1815 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
1820 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T