1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
13 // ============================================================================
15 // ============================================================================
17 // ----------------------------------------------------------------------------
19 // ----------------------------------------------------------------------------
21 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
22 #pragma implementation "strconv.h"
25 // For compilers that support precompilation, includes "wx.h".
26 #include "wx/wxprec.h"
37 #include "wx/strconv.h"
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
54 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
55 #define wxHAVE_WIN32_MB2WC
56 #endif // __WIN32__ but !__WXMICROWIN__
58 // ----------------------------------------------------------------------------
60 // ----------------------------------------------------------------------------
70 #include "wx/encconv.h"
71 #include "wx/fontmap.h"
74 #include <ATSUnicode.h>
75 #include <TextCommon.h>
76 #include <TextEncodingConverter.h>
78 #include "wx/mac/private.h" // includes mac headers
80 // ----------------------------------------------------------------------------
82 // ----------------------------------------------------------------------------
84 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
85 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
87 #if SIZEOF_WCHAR_T == 4
88 #define WC_NAME "UCS4"
89 #define WC_BSWAP BSWAP_UCS4
90 #ifdef WORDS_BIGENDIAN
91 #define WC_NAME_BEST "UCS-4BE"
93 #define WC_NAME_BEST "UCS-4LE"
95 #elif SIZEOF_WCHAR_T == 2
96 #define WC_NAME "UTF16"
97 #define WC_BSWAP BSWAP_UTF16
99 #ifdef WORDS_BIGENDIAN
100 #define WC_NAME_BEST "UTF-16BE"
102 #define WC_NAME_BEST "UTF-16LE"
104 #else // sizeof(wchar_t) != 2 nor 4
105 // does this ever happen?
106 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
109 // ============================================================================
111 // ============================================================================
113 // ----------------------------------------------------------------------------
114 // UTF-16 en/decoding to/from UCS-4
115 // ----------------------------------------------------------------------------
// Encode the UCS-4 value `input` as UTF-16 code unit(s) written through
// `output`.
//
// Branches visible here: a value stored as a single unit by a plain narrowing
// cast, a separate branch for values >= 0x110000 (above the Unicode range),
// and a final branch that splits the value into a high/low surrogate pair.
//
// NOTE(review): the first condition, any NULL check on `output` and every
// return statement sit on lines that are missing from this listing. Callers
// compare the result with (size_t)-1 and otherwise use it as a count of
// 16-bit units -- confirm the exact contract against the full source.
118 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
123 *output
= (wxUint16
) input
;
126 else if (input
>=0x110000)
// high surrogate: (input >> 10) + 0xd7c0 == 0xd800 + ((input - 0x10000) >> 10)
134 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
// low surrogate: the low 10 bits of the value, offset by 0xdc00
135 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
// Decode the UTF-16 code unit (or surrogate pair) starting at `input` into
// the UCS-4 value `output`.
//
// NOTE(review): the return statements are on lines missing from this listing.
// Callers compare the result with (size_t)-1 and otherwise advance the input
// by that many 16-bit units -- confirm against the full source.
141 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
// anything outside 0xd800..0xdfff is not a surrogate
143 if ((*input
<0xd800) || (*input
>0xdfff))
// input[1] is checked against the low-surrogate range.
// NOTE(review): low surrogates span 0xdc00..0xdfff inclusive, so
// `>=0xdfff` also rejects the valid unit 0xdfff; this looks like an
// off-by-one and should probably read `>0xdfff`.
// NOTE(review): neither visible test rejects input[0] in 0xdc00..0xdfff
// (a low surrogate in leading position); such input appears to reach the
// pair arithmetic below -- verify.
148 else if ((input
[1]<0xdc00) || (input
[1]>=0xdfff))
// combine the pair: ((hi - 0xd7c0) << 10) == ((hi - 0xd800) << 10) + 0x10000
155 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
161 // ----------------------------------------------------------------------------
163 // ----------------------------------------------------------------------------
165 wxMBConv::~wxMBConv()
167 // nothing to do here
170 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
174 // calculate the length of the buffer needed first
175 size_t nLen
= MB2WC(NULL
, psz
, 0);
176 if ( nLen
!= (size_t)-1 )
178 // now do the actual conversion
179 wxWCharBuffer
buf(nLen
);
180 nLen
= MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NULL
181 if ( nLen
!= (size_t)-1 )
188 wxWCharBuffer
buf((wchar_t *)NULL
);
193 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
197 size_t nLen
= WC2MB(NULL
, pwz
, 0);
198 if ( nLen
!= (size_t)-1 )
200 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
201 nLen
= WC2MB(buf
.data(), pwz
, nLen
+ 4);
202 if ( nLen
!= (size_t)-1 )
209 wxCharBuffer
buf((char *)NULL
);
214 // ----------------------------------------------------------------------------
216 // ----------------------------------------------------------------------------
218 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
220 return wxMB2WC(buf
, psz
, n
);
223 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
225 return wxWC2MB(buf
, psz
, n
);
228 // ----------------------------------------------------------------------------
230 // ----------------------------------------------------------------------------
233 static char utf7_setD
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
234 "abcdefghijklmnopqrstuvwxyz"
235 "0123456789'(),-./:?";
236 static char utf7_setO
[]="!\"#$%&*;<=>@[]^_`{|}";
237 static char utf7_setB
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
238 "abcdefghijklmnopqrstuvwxyz"
242 // TODO: write actual implementations of UTF-7 here
243 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf
),
244 const char * WXUNUSED(psz
),
245 size_t WXUNUSED(n
)) const
250 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf
),
251 const wchar_t * WXUNUSED(psz
),
252 size_t WXUNUSED(n
)) const
257 // ----------------------------------------------------------------------------
259 // ----------------------------------------------------------------------------
261 static wxUint32 utf8_max
[]=
262 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
264 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
268 while (*psz
&& ((!buf
) || (len
< n
)))
270 unsigned char cc
= *psz
++, fc
= cc
;
272 for (cnt
= 0; fc
& 0x80; cnt
++)
286 // invalid UTF-8 sequence
291 unsigned ocnt
= cnt
- 1;
292 wxUint32 res
= cc
& (0x3f >> cnt
);
296 if ((cc
& 0xC0) != 0x80)
298 // invalid UTF-8 sequence
301 res
= (res
<< 6) | (cc
& 0x3f);
303 if (res
<= utf8_max
[ocnt
])
305 // illegal UTF-8 encoding
309 // cast is ok because wchar_t == wxUint16 if WC_UTF16
310 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
311 if (pa
== (size_t)-1)
320 #endif // WC_UTF16/!WC_UTF16
324 if (buf
&& (len
< n
))
329 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
333 while (*psz
&& ((!buf
) || (len
< n
)))
337 // cast is ok for WC_UTF16
338 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
339 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
341 cc
=(*psz
++) & 0x7fffffff;
344 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
358 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
360 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
365 if (buf
&& (len
<n
)) *buf
= 0;
373 // ----------------------------------------------------------------------------
375 // ----------------------------------------------------------------------------
377 #ifdef WORDS_BIGENDIAN
378 #define wxMBConvUTF16straight wxMBConvUTF16BE
379 #define wxMBConvUTF16swap wxMBConvUTF16LE
381 #define wxMBConvUTF16swap wxMBConvUTF16BE
382 #define wxMBConvUTF16straight wxMBConvUTF16LE
388 // copy 16bit MB to 16bit String
389 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
393 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
396 *buf
++ = *(wxUint16
*)psz
;
399 psz
+= sizeof(wxUint16
);
401 if (buf
&& len
<n
) *buf
=0;
407 // copy 16bit String to 16bit MB
// Copy the wide string `psz` into `buf` as native-endian 16-bit units; this
// is the variant for a 16-bit wchar_t, so each unit is stored unchanged.
// `len` is advanced by sizeof(wxUint16) per character, i.e. it counts bytes.
//
// NOTE(review): the declaration of `len`, the increment of `psz`, any
// `if (buf)` guard around the stores and the return statement are on lines
// missing from this listing -- confirm against the full source.
408 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
// NOTE(review): the guard is `len < n` while each iteration stores
// sizeof(wxUint16) bytes, so for an odd `n` the last store can run one
// byte past the `n` bytes the caller offered.
412 while (*psz
&& (!buf
|| len
< n
))
416 *(wxUint16
*)buf
= *psz
;
417 buf
+= sizeof(wxUint16
);
419 len
+= sizeof(wxUint16
);
// NOTE(review): `n` is a size_t, so `n-sizeof(wxUint16)` wraps around to a
// huge value when n < sizeof(wxUint16); the test then passes and the
// terminator is written outside the caller's buffer. Comparing
// `len + sizeof(wxUint16) <= n` would avoid the wrap.
422 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
428 // swap 16bit MB to 16bit String
429 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
433 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
437 ((char *)buf
)[0] = psz
[1];
438 ((char *)buf
)[1] = psz
[0];
442 psz
+= sizeof(wxUint16
);
444 if (buf
&& len
<n
) *buf
=0;
450 // swap 16bit String to 16bit MB
451 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
455 while (*psz
&& (!buf
|| len
< n
))
459 *buf
++ = ((char*)psz
)[1];
460 *buf
++ = ((char*)psz
)[0];
462 len
+= sizeof(wxUint16
);
465 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
474 // copy 16bit MB to 32bit String
475 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
479 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
482 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
483 if (pa
== (size_t)-1)
489 psz
+= pa
* sizeof(wxUint16
);
491 if (buf
&& len
<n
) *buf
=0;
497 // copy 32bit String to 16bit MB
498 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
502 while (*psz
&& (!buf
|| len
< n
))
505 size_t pa
=encode_utf16(*psz
, cc
);
507 if (pa
== (size_t)-1)
512 *(wxUint16
*)buf
= cc
[0];
513 buf
+= sizeof(wxUint16
);
516 *(wxUint16
*)buf
= cc
[1];
517 buf
+= sizeof(wxUint16
);
521 len
+= pa
*sizeof(wxUint16
);
524 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
530 // swap 16bit MB to 32bit String
531 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
535 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
539 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
540 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
542 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
543 if (pa
== (size_t)-1)
550 psz
+= pa
* sizeof(wxUint16
);
552 if (buf
&& len
<n
) *buf
=0;
558 // swap 32bit String to 16bit MB
559 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
563 while (*psz
&& (!buf
|| len
< n
))
566 size_t pa
=encode_utf16(*psz
, cc
);
568 if (pa
== (size_t)-1)
573 *buf
++ = ((char*)cc
)[1];
574 *buf
++ = ((char*)cc
)[0];
577 *buf
++ = ((char*)cc
)[3];
578 *buf
++ = ((char*)cc
)[2];
582 len
+= pa
*sizeof(wxUint16
);
585 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
593 // ----------------------------------------------------------------------------
595 // ----------------------------------------------------------------------------
597 #ifdef WORDS_BIGENDIAN
598 #define wxMBConvUTF32straight wxMBConvUTF32BE
599 #define wxMBConvUTF32swap wxMBConvUTF32LE
601 #define wxMBConvUTF32swap wxMBConvUTF32BE
602 #define wxMBConvUTF32straight wxMBConvUTF32LE
606 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
607 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
612 // copy 32bit MB to 16bit String
613 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
617 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
621 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
622 if (pa
== (size_t)-1)
632 psz
+= sizeof(wxUint32
);
634 if (buf
&& len
<n
) *buf
=0;
640 // copy 16bit String to 32bit MB
641 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
645 while (*psz
&& (!buf
|| len
< n
))
649 // cast is ok for WC_UTF16
650 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
651 if (pa
== (size_t)-1)
656 *(wxUint32
*)buf
= cc
;
657 buf
+= sizeof(wxUint32
);
659 len
+= sizeof(wxUint32
);
663 if (buf
&& len
<=n
-sizeof(wxUint32
))
671 // swap 32bit MB to 16bit String
672 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
676 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
679 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
680 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
685 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
686 if (pa
== (size_t)-1)
696 psz
+= sizeof(wxUint32
);
706 // swap 16bit String to 32bit MB
707 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
711 while (*psz
&& (!buf
|| len
< n
))
715 // cast is ok for WC_UTF16
716 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
717 if (pa
== (size_t)-1)
727 len
+= sizeof(wxUint32
);
731 if (buf
&& len
<=n
-sizeof(wxUint32
))
740 // copy 32bit MB to 32bit String
741 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
745 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
748 *buf
++ = *(wxUint32
*)psz
;
750 psz
+= sizeof(wxUint32
);
760 // copy 32bit String to 32bit MB
761 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
765 while (*psz
&& (!buf
|| len
< n
))
769 *(wxUint32
*)buf
= *psz
;
770 buf
+= sizeof(wxUint32
);
773 len
+= sizeof(wxUint32
);
777 if (buf
&& len
<=n
-sizeof(wxUint32
))
784 // swap 32bit MB to 32bit String
785 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
789 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
793 ((char *)buf
)[0] = psz
[3];
794 ((char *)buf
)[1] = psz
[2];
795 ((char *)buf
)[2] = psz
[1];
796 ((char *)buf
)[3] = psz
[0];
800 psz
+= sizeof(wxUint32
);
810 // swap 32bit String to 32bit MB
811 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
815 while (*psz
&& (!buf
|| len
< n
))
819 *buf
++ = ((char *)psz
)[3];
820 *buf
++ = ((char *)psz
)[2];
821 *buf
++ = ((char *)psz
)[1];
822 *buf
++ = ((char *)psz
)[0];
824 len
+= sizeof(wxUint32
);
828 if (buf
&& len
<=n
-sizeof(wxUint32
))
838 // ============================================================================
839 // The classes doing conversion using the iconv_xxx() functions
840 // ============================================================================
844 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
845 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
846 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
847 // (which means error) and says there are 0 bytes left in the input buffer --
848 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
849 // this alternative test for iconv() failure.
850 // [This bug does not appear in glibc 2.2.]
851 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
852 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
853 (errno != E2BIG || bufLeft != 0))
855 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
858 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
860 // ----------------------------------------------------------------------------
861 // wxMBConv_iconv: encapsulates an iconv character set
862 // ----------------------------------------------------------------------------
864 class wxMBConv_iconv
: public wxMBConv
867 wxMBConv_iconv(const wxChar
*name
);
868 virtual ~wxMBConv_iconv();
870 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
871 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
874 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
877 // the iconv handlers used to translate from multibyte to wide char and in
878 // the other direction
883 // the name (for iconv_open()) of a wide char charset -- if none is
884 // available on this machine, it will remain NULL
885 static const char *ms_wcCharsetName
;
887 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
888 // different endian-ness than the native one
889 static bool ms_wcNeedsSwap
;
892 const char *wxMBConv_iconv::ms_wcCharsetName
= NULL
;
893 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
895 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
897 // Do it the hard way
899 for (size_t i
= 0; i
< wxStrlen(name
)+1; i
++)
900 cname
[i
] = (char) name
[i
];
902 // check for charset that represents wchar_t:
903 if (ms_wcCharsetName
== NULL
)
905 ms_wcNeedsSwap
= false;
907 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
908 ms_wcCharsetName
= WC_NAME_BEST
;
909 m2w
= iconv_open(ms_wcCharsetName
, cname
);
911 if (m2w
== (iconv_t
)-1)
913 // try charset w/o bytesex info (e.g. "UCS4")
914 // and check for bytesex ourselves:
915 ms_wcCharsetName
= WC_NAME
;
916 m2w
= iconv_open(ms_wcCharsetName
, cname
);
918 // last bet, try if it knows WCHAR_T pseudo-charset
919 if (m2w
== (iconv_t
)-1)
921 ms_wcCharsetName
= "WCHAR_T";
922 m2w
= iconv_open(ms_wcCharsetName
, cname
);
925 if (m2w
!= (iconv_t
)-1)
927 char buf
[2], *bufPtr
;
928 wchar_t wbuf
[2], *wbufPtr
;
936 outsz
= SIZEOF_WCHAR_T
* 2;
940 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
941 (char**)&wbufPtr
, &outsz
);
943 if (ICONV_FAILED(res
, insz
))
945 ms_wcCharsetName
= NULL
;
946 wxLogLastError(wxT("iconv"));
947 wxLogError(_("Conversion to charset '%s' doesn't work."), name
);
951 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
956 ms_wcCharsetName
= NULL
;
958 // VS: we must not output an error here, since wxWindows will safely
959 // fall back to using wxEncodingConverter.
960 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name
);
964 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName
, ms_wcNeedsSwap
);
966 else // we already have ms_wcCharsetName
968 m2w
= iconv_open(ms_wcCharsetName
, cname
);
971 // NB: don't ever pass NULL to iconv_open(), it may crash!
972 if ( ms_wcCharsetName
)
974 w2m
= iconv_open( cname
, ms_wcCharsetName
);
982 wxMBConv_iconv::~wxMBConv_iconv()
984 if ( m2w
!= (iconv_t
)-1 )
986 if ( w2m
!= (iconv_t
)-1 )
990 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
992 size_t inbuf
= strlen(psz
);
993 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
995 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
996 wchar_t *bufPtr
= buf
;
997 const char *pszPtr
= psz
;
1001 // have destination buffer, convert there
1003 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1004 (char**)&bufPtr
, &outbuf
);
1005 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1009 // convert to native endianness
1010 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
1013 // NB: iconv was given only strlen(psz) characters on input, and so
1014 // it couldn't convert the trailing zero. Let's do it ourselves
1015 // if there's some room left for it in the output buffer.
1021 // no destination buffer... convert using temp buffer
1022 // to calculate destination buffer requirement
1027 outbuf
= 8*SIZEOF_WCHAR_T
;
1030 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1031 (char**)&bufPtr
, &outbuf
);
1033 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1034 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1037 if (ICONV_FAILED(cres
, inbuf
))
1039 //VS: it is ok if iconv fails, hence trace only
1040 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
// Convert the wide string `psz` to the multibyte charset through the `w2m`
// iconv descriptor.
//
// With a destination buffer, iconv() writes straight into `buf`. Without
// one, iconv() is run repeatedly over a 16-byte scratch buffer (`tbuf`) for
// as long as it fails with E2BIG, which is how the required output size is
// measured. A failure is only traced, not reported as an error.
//
// NOTE(review): the declarations of `outbuf`, `cres` and `tbuf`, the
// conditions selecting each branch, the cleanup of `tmpbuf` and the return
// statements are on lines missing from this listing -- confirm against the
// full source.
1047 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
// inbuf is the input size in BYTES (character count * SIZEOF_WCHAR_T),
// which is the unit iconv() expects for its inbytesleft argument
1049 size_t inbuf
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1053 wchar_t *tmpbuf
= 0;
1057 // need to copy to temp buffer to switch endianness
1058 // this absolutely doesn't rock!
1059 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1060 // could be in read-only memory, or be accessed in some other thread)
// NOTE(review): inbuf is already a byte count here, yet the next three
// statements use it as a character count:
//  - malloc()/memcpy() size (inbuf+1)*SIZEOF_WCHAR_T multiplies by the
//    element size a second time, so memcpy() reads well past the end of
//    `psz` (the string only occupies inbuf + SIZEOF_WCHAR_T bytes
//    including its terminator);
//  - WC_BSWAP(tmpbuf, inbuf) swaps inbuf elements although only
//    inbuf / SIZEOF_WCHAR_T of them belong to the string.
// Using inbuf / SIZEOF_WCHAR_T as the character count would fix all three.
// The malloc() result is also used without a NULL check.
1061 tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
1062 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
1063 WC_BSWAP(tmpbuf
, inbuf
)
1069 // have destination buffer, convert there
1070 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1074 // NB: iconv was given only wcslen(psz) characters on input, and so
1075 // it couldn't convert the trailing zero. Let's do it ourselves
1076 // if there's some room left for it in the output buffer.
1082 // no destination buffer... convert using temp buffer
1083 // to calculate destination buffer requirement
// rewind the output pointer to the scratch buffer before every pass
1087 buf
= tbuf
; outbuf
= 16;
1089 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
// E2BIG only means the scratch buffer filled up, so keep going
1092 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1100 if (ICONV_FAILED(cres
, inbuf
))
1102 //VS: it is ok if iconv fails, hence trace only
1103 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1110 #endif // HAVE_ICONV
1113 // ============================================================================
1114 // Win32 conversion classes
1115 // ============================================================================
1117 #ifdef wxHAVE_WIN32_MB2WC
1120 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1121 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1123 class wxMBConv_win32
: public wxMBConv
1128 m_CodePage
= CP_ACP
;
1131 wxMBConv_win32(const wxChar
* name
)
1133 m_CodePage
= wxCharsetToCodepage(name
);
1136 wxMBConv_win32(wxFontEncoding encoding
)
1138 m_CodePage
= wxEncodingToCodepage(encoding
);
1141 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1143 const size_t len
= ::MultiByteToWideChar
1145 m_CodePage
, // code page
1147 psz
, // input string
1148 -1, // its length (NUL-terminated)
1149 buf
, // output string
1150 buf
? n
: 0 // size of output buffer
1153 // note that it returns count of written chars for buf != NULL and size
1154 // of the needed buffer for buf == NULL so in either case the length of
1155 // the string (which never includes the terminating NUL) is one less
1156 return len
? len
- 1 : (size_t)-1;
1159 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1162 we have a problem here: by default, WideCharToMultiByte() may
1163 replace characters unrepresentable in the target code page with bad
1164 quality approximations such as turning "1/2" symbol (U+00BD) into
1165 "1" for the code pages which don't have it and we, obviously, want
1166 to avoid this at any price
1168 the trouble is that this function does it _silently_, i.e. it won't
1169 even tell us whether it did or not... Win98/2000 and higher provide
1170 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1171 we have to resort to a round trip, i.e. check that converting back
1172 results in the same string -- this is, of course, expensive but
1173 otherwise we simply can't be sure to not garble the data.
1176 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1177 // it doesn't work with CJK encodings (which we test for rather roughly
1178 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1180 BOOL usedDef
wxDUMMY_INITIALIZE(false),
1183 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1185 // it's our lucky day
1186 flags
= WC_NO_BEST_FIT_CHARS
;
1187 pUsedDef
= &usedDef
;
1189 else // old system or unsupported encoding
1195 const size_t len
= ::WideCharToMultiByte
1197 m_CodePage
, // code page
1198 flags
, // either none or no best fit
1199 pwz
, // input string
1200 -1, // it is (wide) NUL-terminated
1201 buf
, // output buffer
1202 buf
? n
: 0, // and its size
1203 NULL
, // default "replacement" char
1204 pUsedDef
// [out] was it used?
1209 // function totally failed
1213 // if we were really converting, check if we succeeded
1218 // check if the conversion failed, i.e. if any replacements
1223 else // we must resort to double tripping...
1225 wxWCharBuffer
wcBuf(n
);
1226 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1227 wcscmp(wcBuf
, pwz
) != 0 )
1229 // we didn't obtain the same thing we started from, hence
1230 // the conversion was lossy and we consider that it failed
1236 // see the comment above for the reason of "len - 1"
1240 bool IsOk() const { return m_CodePage
!= -1; }
1243 static bool CanUseNoBestFit()
1245 static int s_isWin98Or2k
= -1;
1247 if ( s_isWin98Or2k
== -1 )
1250 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
1253 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
1257 s_isWin98Or2k
= verMaj
>= 5;
1261 // unknown, be conservative by default
1265 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1268 return s_isWin98Or2k
== 1;
1274 #endif // wxHAVE_WIN32_MB2WC
1276 // ============================================================================
1277 // Mac conversion classes
1278 // ============================================================================
1280 #if defined(__WXMAC__) && defined(TARGET_CARBON)
1282 class wxMBConv_mac
: public wxMBConv
1287 Init(CFStringGetSystemEncoding()) ;
1290 wxMBConv_mac(const wxChar
* name
)
1292 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, FALSE
) ) ) ;
1295 wxMBConv_mac(wxFontEncoding encoding
)
1297 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
1302 OSStatus status
= noErr
;
1303 status
= TECDisposeConverter(m_MB2WC_converter
);
1304 status
= TECDisposeConverter(m_WC2MB_converter
);
1308 void Init( TextEncodingBase encoding
)
1310 OSStatus status
= noErr
;
1311 m_char_encoding
= encoding
;
1312 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
1314 status
= TECCreateConverter(&m_MB2WC_converter
,
1316 m_unicode_encoding
);
1317 status
= TECCreateConverter(&m_WC2MB_converter
,
1322 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1324 OSStatus status
= noErr
;
1325 ByteCount byteOutLen
;
1326 ByteCount byteInLen
= strlen(psz
) ;
1327 wchar_t *tbuf
= NULL
;
1328 UniChar
* ubuf
= NULL
;
1334 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
1336 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
1337 #if SIZEOF_WCHAR_T == 4
1338 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
1340 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
1342 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
1343 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
1344 #if SIZEOF_WCHAR_T == 4
1345 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
1346 // is not properly terminated we get random characters at the end
1347 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
1348 wxMBConvUTF16BE converter
;
1349 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
1352 res
= byteOutLen
/ sizeof( UniChar
) ;
1357 if ( buf
&& res
< n
)
1363 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1365 OSStatus status
= noErr
;
1366 ByteCount byteOutLen
;
1367 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1375 tbuf
= (char*) malloc( n
) ;
1378 ByteCount byteBufferLen
= n
;
1379 UniChar
* ubuf
= NULL
;
1380 #if SIZEOF_WCHAR_T == 4
1381 wxMBConvUTF16BE converter
;
1382 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
1383 byteInLen
= unicharlen
;
1384 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
1385 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
1387 ubuf
= (UniChar
*) psz
;
1389 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
1390 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
1391 #if SIZEOF_WCHAR_T == 4
1397 size_t res
= byteOutLen
;
1398 if ( buf
&& res
< n
)
1405 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
1408 TECObjectRef m_MB2WC_converter
;
1409 TECObjectRef m_WC2MB_converter
;
1411 TextEncodingBase m_char_encoding
;
1412 TextEncodingBase m_unicode_encoding
;
1415 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1417 // ============================================================================
1418 // wxEncodingConverter based conversion classes
1419 // ============================================================================
1423 class wxMBConv_wxwin
: public wxMBConv
1428 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
1429 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
1433 // temporarily just use wxEncodingConverter stuff,
1434 // so that it works while a better implementation is built
1435 wxMBConv_wxwin(const wxChar
* name
)
1438 m_enc
= wxFontMapper::Get()->CharsetToEncoding(name
, false);
1440 m_enc
= wxFONTENCODING_SYSTEM
;
1445 wxMBConv_wxwin(wxFontEncoding enc
)
1452 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
1454 size_t inbuf
= strlen(psz
);
1456 m2w
.Convert(psz
,buf
);
1460 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
1462 const size_t inbuf
= wxWcslen(psz
);
1464 w2m
.Convert(psz
,buf
);
1469 bool IsOk() const { return m_ok
; }
1472 wxFontEncoding m_enc
;
1473 wxEncodingConverter m2w
, w2m
;
1475 // were we initialized successfully?
1478 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
1481 #endif // wxUSE_FONTMAP
1483 // ============================================================================
1484 // wxCSConv implementation
1485 // ============================================================================
1487 void wxCSConv::Init()
1494 wxCSConv::wxCSConv(const wxChar
*charset
)
1503 m_encoding
= wxFONTENCODING_SYSTEM
;
1506 wxCSConv::wxCSConv(wxFontEncoding encoding
)
1508 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
1510 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
1512 encoding
= wxFONTENCODING_SYSTEM
;
1517 m_encoding
= encoding
;
1520 wxCSConv::~wxCSConv()
1525 wxCSConv::wxCSConv(const wxCSConv
& conv
)
1530 SetName(conv
.m_name
);
1531 m_encoding
= conv
.m_encoding
;
1534 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
1538 SetName(conv
.m_name
);
1539 m_encoding
= conv
.m_encoding
;
1544 void wxCSConv::Clear()
1553 void wxCSConv::SetName(const wxChar
*charset
)
1557 m_name
= wxStrdup(charset
);
1562 wxMBConv
*wxCSConv::DoCreate() const
1564 // check for the special case of ASCII or ISO8859-1 charset: as we have
1565 // special knowledge of it anyhow, we don't need to create a special
1566 // conversion object
1567 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
1569 // don't convert at all
1573 // we trust OS to do conversion better than we can so try external
1574 // conversion methods first
1576 // the full order is:
1577 // 1. OS conversion (iconv() under Unix or Win32 API)
1578 // 2. hard coded conversions for UTF
1579 // 3. wxEncodingConverter as fall back
1585 #endif // !wxUSE_FONTMAP
1587 wxString
name(m_name
);
1591 name
= wxFontMapper::Get()->GetEncodingName(m_encoding
);
1592 #endif // wxUSE_FONTMAP
1594 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
1600 #endif // HAVE_ICONV
1602 #ifdef wxHAVE_WIN32_MB2WC
1604 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
1605 : new wxMBConv_win32(m_encoding
);
1611 #endif // wxHAVE_WIN32_MB2WC
1612 #if defined(__WXMAC__)
1614 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
) )
1617 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
1618 : new wxMBConv_mac(m_encoding
);
1627 wxFontEncoding enc
= m_encoding
;
1629 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
1631 // use "false" to suppress interactive dialogs -- we can be called from
1632 // anywhere and popping up a dialog from here is the last thing we want to
1634 enc
= wxFontMapper::Get()->CharsetToEncoding(m_name
, false);
1636 #endif // wxUSE_FONTMAP
1640 case wxFONTENCODING_UTF7
:
1641 return new wxMBConvUTF7
;
1643 case wxFONTENCODING_UTF8
:
1644 return new wxMBConvUTF8
;
1646 case wxFONTENCODING_UTF16BE
:
1647 return new wxMBConvUTF16BE
;
1649 case wxFONTENCODING_UTF16LE
:
1650 return new wxMBConvUTF16LE
;
1652 case wxFONTENCODING_UTF32BE
:
1653 return new wxMBConvUTF32BE
;
1655 case wxFONTENCODING_UTF32LE
:
1656 return new wxMBConvUTF32LE
;
1659 // nothing to do but put here to suppress gcc warnings
1666 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
1667 : new wxMBConv_wxwin(m_encoding
);
1673 #endif // wxUSE_FONTMAP
1675 // NB: This is a hack to prevent deadlock. What could otherwise happen
1676 // in Unicode build: wxConvLocal creation ends up being here
1677 // because of some failure and logs the error. But wxLog will try to
1678 // attach timestamp, for which it will need wxConvLocal (to convert
1679 // time to char* and then wchar_t*), but that fails, tries to log
1680 // error, but wxLog has a (already locked) critical section that
1681 // guards static buffer.
1682 static bool alreadyLoggingError
= false;
1683 if (!alreadyLoggingError
)
1685 alreadyLoggingError
= true;
1686 wxLogError(_("Cannot convert from the charset '%s'!"),
1690 wxFontMapper::GetEncodingDescription(m_encoding
).c_str()
1691 #else // !wxUSE_FONTMAP
1692 wxString::Format(_("encoding %s"), m_encoding
).c_str()
1693 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1695 alreadyLoggingError
= false;
1701 void wxCSConv::CreateConvIfNeeded() const
1705 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
1708 // if we have neither the name nor the encoding, use the default
1709 // encoding for this system
1710 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
1712 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
1714 #endif // wxUSE_INTL
1716 self
->m_convReal
= DoCreate();
1717 self
->m_deferred
= false;
1721 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1723 CreateConvIfNeeded();
1726 return m_convReal
->MB2WC(buf
, psz
, n
);
1729 size_t len
= strlen(psz
);
1733 for (size_t c
= 0; c
<= len
; c
++)
1734 buf
[c
] = (unsigned char)(psz
[c
]);
1740 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1742 CreateConvIfNeeded();
1745 return m_convReal
->WC2MB(buf
, psz
, n
);
1748 const size_t len
= wxWcslen(psz
);
1751 for (size_t c
= 0; c
<= len
; c
++)
1760 for (size_t c
= 0; c
<= len
; c
++)
1770 // ----------------------------------------------------------------------------
1772 // ----------------------------------------------------------------------------
1775 static wxMBConv_win32 wxConvLibcObj
;
1777 static wxMBConvLibc wxConvLibcObj
;
1780 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
1781 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
1782 static wxMBConvUTF7 wxConvUTF7Obj
;
1783 static wxMBConvUTF8 wxConvUTF8Obj
;
1786 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
1787 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
1788 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
1789 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
1790 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
1791 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
1793 #else // !wxUSE_WCHAR_T
1795 // stand-ins in absence of wchar_t
1796 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
1801 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T