1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
13 // ============================================================================
15 // ============================================================================
17 // ----------------------------------------------------------------------------
19 // ----------------------------------------------------------------------------
21 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
22 #pragma implementation "strconv.h"
25 // For compilers that support precompilation, includes "wx.h".
26 #include "wx/wxprec.h"
37 #include "wx/strconv.h"
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
54 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
55 #define wxHAVE_WIN32_MB2WC
56 #endif // __WIN32__ but !__WXMICROWIN__
58 // ----------------------------------------------------------------------------
60 // ----------------------------------------------------------------------------
70 #include "wx/encconv.h"
71 #include "wx/fontmap.h"
74 #include "ATSUnicode.h"
75 #include "TextCommon.h"
76 #include "TextEncodingConverter.h"
78 #include "wx/mac/private.h" // includes mac headers
80 // ----------------------------------------------------------------------------
82 // ----------------------------------------------------------------------------
84 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
85 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
87 #if SIZEOF_WCHAR_T == 4
88 #define WC_NAME "UCS4"
89 #define WC_BSWAP BSWAP_UCS4
90 #ifdef WORDS_BIGENDIAN
91 #define WC_NAME_BEST "UCS-4BE"
93 #define WC_NAME_BEST "UCS-4LE"
95 #elif SIZEOF_WCHAR_T == 2
96 #define WC_NAME "UTF16"
97 #define WC_BSWAP BSWAP_UTF16
99 #ifdef WORDS_BIGENDIAN
100 #define WC_NAME_BEST "UTF-16BE"
102 #define WC_NAME_BEST "UTF-16LE"
104 #else // sizeof(wchar_t) != 2 nor 4
105 // does this ever happen?
106 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
109 // ============================================================================
111 // ============================================================================
113 // ----------------------------------------------------------------------------
114 // UTF-16 en/decoding to/from UCS-4
115 // ----------------------------------------------------------------------------
// Encodes the UCS-4 value `input` as UTF-16 code units stored through
// `output`.
//
// A value stored as a single unit is simply truncated to wxUint16. Values of
// 0x110000 and above are tested for separately (they lie beyond the range
// UTF-16 can represent). Everything else is written as a surrogate pair.
// Callers in this file compare the result against (size_t)-1 to detect
// failure.
118 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
// single code unit: the value is stored as is
123 *output
= (wxUint16
) input
;
126 else if (input
>=0x110000)
// lead surrogate: 0xd7c0 is 0xd800 - (0x10000 >> 10), so the 0x10000 offset
// is removed and the surrogate base added in a single step
134 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
// trail surrogate: the low 10 bits on top of 0xdc00
135 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
141 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
143 if ((*input
<0xd800) || (*input
>0xdfff))
148 else if ((input
[1]<0xdc00) || (input
[1]>=0xdfff))
155 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
161 // ----------------------------------------------------------------------------
163 // ----------------------------------------------------------------------------
165 wxMBConv::~wxMBConv()
167 // nothing to do here
// Converts the multibyte string psz to wide characters, producing a
// wxWCharBuffer.
//
// MB2WC() is called twice: first with a NULL destination and size 0 to learn
// the required length, then with the freshly built buffer to do the real
// conversion. If the first call reports (size_t)-1, a buffer constructed
// from a NULL pointer is built instead.
170 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
174 // calculate the length of the buffer needed first
175 size_t nLen
= MB2WC(NULL
, psz
, 0);
176 if ( nLen
!= (size_t)-1 )
178 // now do the actual conversion
179 wxWCharBuffer
buf(nLen
);
180 MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NUL
// failure path: buffer built from a NULL pointer
186 wxWCharBuffer
buf((wchar_t *)NULL
);
// Converts the wide character string pwz to multibyte, producing a
// wxCharBuffer.
//
// WC2MB() is called twice: first with a NULL destination and size 0 to learn
// the required length, then with the freshly built buffer to do the real
// conversion. If the first call reports (size_t)-1, a buffer constructed
// from a NULL pointer is built instead.
191 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
195 size_t nLen
= WC2MB(NULL
, pwz
, 0);
196 if ( nLen
!= (size_t)-1 )
// the buffer is requested 3 larger than the string and WC2MB() is told it
// may use nLen + 4 bytes -- presumably wxCharBuffer(len) itself reserves one
// extra byte for a terminator; confirm against wxCharBuffer
198 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
199 WC2MB(buf
.data(), pwz
, nLen
+ 4);
// failure path: buffer built from a NULL pointer
205 wxCharBuffer
buf((char *)NULL
);
210 // ----------------------------------------------------------------------------
212 // ----------------------------------------------------------------------------
// Multibyte to wide character conversion: forwards all three arguments
// unchanged to wxMB2WC() and returns its result.
214 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
216 return wxMB2WC(buf
, psz
, n
);
// Wide character to multibyte conversion: forwards all three arguments
// unchanged to wxWC2MB() and returns its result.
219 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
221 return wxWC2MB(buf
, psz
, n
);
224 // ----------------------------------------------------------------------------
226 // ----------------------------------------------------------------------------
229 static char utf7_setD
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
230 "abcdefghijklmnopqrstuvwxyz"
231 "0123456789'(),-./:?";
232 static char utf7_setO
[]="!\"#$%&*;<=>@[]^_`{|}";
233 static char utf7_setB
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
234 "abcdefghijklmnopqrstuvwxyz"
238 // TODO: write actual implementations of UTF-7 here
239 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf
),
240 const char * WXUNUSED(psz
),
241 size_t WXUNUSED(n
)) const
246 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf
),
247 const wchar_t * WXUNUSED(psz
),
248 size_t WXUNUSED(n
)) const
253 // ----------------------------------------------------------------------------
255 // ----------------------------------------------------------------------------
257 static wxUint32 utf8_max
[]=
258 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
// Converts the NUL-terminated UTF-8 string psz to wide characters.
//
// The main loop keeps going while input bytes remain and either buf is NULL
// or fewer than n output units have been counted in len.
260 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
264 while (*psz
&& ((!buf
) || (len
< n
)))
// cc is the byte being decoded, fc a scratch copy used to inspect its high
// bits
266 unsigned char cc
= *psz
++, fc
= cc
;
// cnt counts the passes made while the top bit of fc is set -- presumably fc
// is shifted left on each pass, which makes cnt the number of leading 1 bits
// of the first byte; confirm
268 for (cnt
= 0; fc
& 0x80; cnt
++)
282 // invalid UTF-8 sequence
// ocnt is used further down as the index into utf8_max[]
287 unsigned ocnt
= cnt
- 1;
// keep only the payload bits of the lead byte
288 wxUint32 res
= cc
& (0x3f >> cnt
);
// every continuation byte must have the form 10xxxxxx
292 if ((cc
& 0xC0) != 0x80)
294 // invalid UTF-8 sequence
// append the 6 payload bits of the continuation byte
297 res
= (res
<< 6) | (cc
& 0x3f);
// a value no larger than utf8_max[ocnt] is rejected: it did not need this
// many bytes
299 if (res
<= utf8_max
[ocnt
])
301 // illegal UTF-8 encoding
305 // cast is ok because wchar_t == wxUint16 if WC_UTF16
306 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
// (size_t)-1 is encode_utf16()'s failure value
307 if (pa
== (size_t)-1)
316 #endif // WC_UTF16/!WC_UTF16
// only act if a buffer was given and it still has room -- presumably this
// writes the terminating NUL, as the same test does in WC2MB(); confirm
320 if (buf
&& (len
< n
))
// Converts the NUL-terminated wide character string psz to UTF-8.
//
// The main loop keeps going while input remains and either buf is NULL or
// fewer than n output bytes have been counted in len.
325 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
329 while (*psz
&& ((!buf
) || (len
< n
)))
333 // cast is ok for WC_UTF16
334 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
// if decoding failed step over a single unit, otherwise over as many units as
// decode_utf16() reports
335 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
// plain read of one wide character, limited to 31 bits (presumably the
// branch used when wchar_t is not UTF-16; confirm)
337 cc
=(*psz
++) & 0x7fffffff;
// find the first utf8_max[] entry that cc does not exceed; cnt is then used
// below as the number of 6-bit groups following the lead byte
340 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
// lead byte: (-128 >> cnt) supplies the run of high 1 bits, the rest is the
// top part of cc.
// NOTE(review): this relies on right-shifting a negative int being an
// arithmetic shift, which is implementation-defined before C++20.
354 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
// continuation byte: 10xxxxxx carrying the next 6 bits of cc
356 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
// NUL-terminate only if a buffer was given and it still has room
361 if (buf
&& (len
<n
)) *buf
= 0;
369 // ----------------------------------------------------------------------------
371 // ----------------------------------------------------------------------------
373 #ifdef WORDS_BIGENDIAN
374 #define wxMBConvUTF16straight wxMBConvUTF16BE
375 #define wxMBConvUTF16swap wxMBConvUTF16LE
377 #define wxMBConvUTF16swap wxMBConvUTF16BE
378 #define wxMBConvUTF16straight wxMBConvUTF16LE
384 // copy 16bit MB to 16bit String
385 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
389 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
392 *buf
++ = *(wxUint16
*)psz
;
395 psz
+= sizeof(wxUint16
);
397 if (buf
&& len
<n
) *buf
=0;
403 // copy 16bit String to 16bit MB
// Copies the NUL-terminated wide character string psz to buf as 16-bit
// units without changing the byte order. len counts bytes: it grows by
// sizeof(wxUint16) per character.
404 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
408 while (*psz
&& (!buf
|| len
< n
))
// store one unit through a wxUint16 pointer
// NOTE(review): this assumes buf is suitably aligned for wxUint16
412 *(wxUint16
*)buf
= *psz
;
413 buf
+= sizeof(wxUint16
);
415 len
+= sizeof(wxUint16
);
// write a 16-bit terminator if it still fits.
// NOTE(review): n is a size_t, so n - sizeof(wxUint16) wraps around to a huge
// value when n < sizeof(wxUint16); the test then succeeds and the terminator
// is written even though the buffer has no room for it. Comparing
// len + sizeof(wxUint16) <= n would avoid the wrap-around.
418 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
424 // swap 16bit MB to 16bit String
425 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
429 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
433 ((char *)buf
)[0] = psz
[1];
434 ((char *)buf
)[1] = psz
[0];
438 psz
+= sizeof(wxUint16
);
440 if (buf
&& len
<n
) *buf
=0;
446 // swap 16bit String to 16bit MB
447 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
451 while (*psz
&& (!buf
|| len
< n
))
455 *buf
++ = ((char*)psz
)[1];
456 *buf
++ = ((char*)psz
)[0];
458 len
+= sizeof(wxUint16
);
461 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
470 // copy 16bit MB to 32bit String
471 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
475 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
478 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
479 if (pa
== (size_t)-1)
485 psz
+= pa
* sizeof(wxUint16
);
487 if (buf
&& len
<n
) *buf
=0;
493 // copy 32bit String to 16bit MB
494 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
498 while (*psz
&& (!buf
|| len
< n
))
501 size_t pa
=encode_utf16(*psz
, cc
);
503 if (pa
== (size_t)-1)
508 *(wxUint16
*)buf
= cc
[0];
509 buf
+= sizeof(wxUint16
);
512 *(wxUint16
*)buf
= cc
[1];
513 buf
+= sizeof(wxUint16
);
517 len
+= pa
*sizeof(wxUint16
);
520 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
526 // swap 16bit MB to 32bit String
527 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
531 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
535 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
536 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
538 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
539 if (pa
== (size_t)-1)
546 psz
+= pa
* sizeof(wxUint16
);
548 if (buf
&& len
<n
) *buf
=0;
554 // swap 32bit String to 16bit MB
555 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
559 while (*psz
&& (!buf
|| len
< n
))
562 size_t pa
=encode_utf16(*psz
, cc
);
564 if (pa
== (size_t)-1)
569 *buf
++ = ((char*)cc
)[1];
570 *buf
++ = ((char*)cc
)[0];
573 *buf
++ = ((char*)cc
)[3];
574 *buf
++ = ((char*)cc
)[2];
578 len
+= pa
*sizeof(wxUint16
);
581 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
589 // ----------------------------------------------------------------------------
591 // ----------------------------------------------------------------------------
593 #ifdef WORDS_BIGENDIAN
594 #define wxMBConvUTF32straight wxMBConvUTF32BE
595 #define wxMBConvUTF32swap wxMBConvUTF32LE
597 #define wxMBConvUTF32swap wxMBConvUTF32BE
598 #define wxMBConvUTF32straight wxMBConvUTF32LE
602 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
603 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
608 // copy 32bit MB to 16bit String
609 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
613 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
617 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
618 if (pa
== (size_t)-1)
628 psz
+= sizeof(wxUint32
);
630 if (buf
&& len
<n
) *buf
=0;
636 // copy 16bit String to 32bit MB
637 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
641 while (*psz
&& (!buf
|| len
< n
))
645 // cast is ok for WC_UTF16
646 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
647 if (pa
== (size_t)-1)
652 *(wxUint32
*)buf
= cc
;
653 buf
+= sizeof(wxUint32
);
655 len
+= sizeof(wxUint32
);
659 if (buf
&& len
<=n
-sizeof(wxUint32
))
667 // swap 32bit MB to 16bit String
668 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
672 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
675 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
676 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
681 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
682 if (pa
== (size_t)-1)
692 psz
+= sizeof(wxUint32
);
702 // swap 16bit String to 32bit MB
703 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
707 while (*psz
&& (!buf
|| len
< n
))
711 // cast is ok for WC_UTF16
712 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
713 if (pa
== (size_t)-1)
723 len
+= sizeof(wxUint32
);
727 if (buf
&& len
<=n
-sizeof(wxUint32
))
736 // copy 32bit MB to 32bit String
737 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
741 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
744 *buf
++ = *(wxUint32
*)psz
;
746 psz
+= sizeof(wxUint32
);
756 // copy 32bit String to 32bit MB
757 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
761 while (*psz
&& (!buf
|| len
< n
))
765 *(wxUint32
*)buf
= *psz
;
766 buf
+= sizeof(wxUint32
);
769 len
+= sizeof(wxUint32
);
773 if (buf
&& len
<=n
-sizeof(wxUint32
))
780 // swap 32bit MB to 32bit String
781 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
785 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
789 ((char *)buf
)[0] = psz
[3];
790 ((char *)buf
)[1] = psz
[2];
791 ((char *)buf
)[2] = psz
[1];
792 ((char *)buf
)[3] = psz
[0];
796 psz
+= sizeof(wxUint32
);
806 // swap 32bit String to 32bit MB
807 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
811 while (*psz
&& (!buf
|| len
< n
))
815 *buf
++ = ((char *)psz
)[3];
816 *buf
++ = ((char *)psz
)[2];
817 *buf
++ = ((char *)psz
)[1];
818 *buf
++ = ((char *)psz
)[0];
820 len
+= sizeof(wxUint32
);
824 if (buf
&& len
<=n
-sizeof(wxUint32
))
834 // ============================================================================
835 // The classes doing conversion using the iconv_xxx() functions
836 // ============================================================================
840 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
841 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
842 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
843 // (which means error) and says there are 0 bytes left in the input buffer --
844 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
845 // this alternative test for iconv() failure.
846 // [This bug does not appear in glibc 2.2.]
847 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
848 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
849 (errno != E2BIG || bufLeft != 0))
851 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
854 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
856 // ----------------------------------------------------------------------------
857 // wxMBConv_iconv: encapsulates an iconv character set
858 // ----------------------------------------------------------------------------
860 class wxMBConv_iconv
: public wxMBConv
863 wxMBConv_iconv(const wxChar
*name
);
864 virtual ~wxMBConv_iconv();
866 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
867 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
870 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
873 // the iconv handlers used to translate from multibyte to wide char and in
874 // the other direction
879 // the name (for iconv_open()) of a wide char charset -- if none is
880 // available on this machine, it will remain NULL
881 static const char *ms_wcCharsetName
;
883 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
884 // different endian-ness than the native one
885 static bool ms_wcNeedsSwap
;
888 const char *wxMBConv_iconv::ms_wcCharsetName
= NULL
;
889 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
891 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
893 // Do it the hard way
895 for (size_t i
= 0; i
< wxStrlen(name
)+1; i
++)
896 cname
[i
] = (char) name
[i
];
898 // check for charset that represents wchar_t:
899 if (ms_wcCharsetName
== NULL
)
901 ms_wcNeedsSwap
= false;
903 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
904 ms_wcCharsetName
= WC_NAME_BEST
;
905 m2w
= iconv_open(ms_wcCharsetName
, cname
);
907 if (m2w
== (iconv_t
)-1)
909 // try charset w/o bytesex info (e.g. "UCS4")
910 // and check for bytesex ourselves:
911 ms_wcCharsetName
= WC_NAME
;
912 m2w
= iconv_open(ms_wcCharsetName
, cname
);
914 // last bet, try if it knows WCHAR_T pseudo-charset
915 if (m2w
== (iconv_t
)-1)
917 ms_wcCharsetName
= "WCHAR_T";
918 m2w
= iconv_open(ms_wcCharsetName
, cname
);
921 if (m2w
!= (iconv_t
)-1)
923 char buf
[2], *bufPtr
;
924 wchar_t wbuf
[2], *wbufPtr
;
932 outsz
= SIZEOF_WCHAR_T
* 2;
936 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
937 (char**)&wbufPtr
, &outsz
);
939 if (ICONV_FAILED(res
, insz
))
941 ms_wcCharsetName
= NULL
;
942 wxLogLastError(wxT("iconv"));
943 wxLogError(_("Conversion to charset '%s' doesn't work."), name
);
947 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
952 ms_wcCharsetName
= NULL
;
954 // VS: we must not output an error here, since wxWindows will safely
955 // fall back to using wxEncodingConverter.
956 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name
);
960 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName
, ms_wcNeedsSwap
);
962 else // we already have ms_wcCharsetName
964 m2w
= iconv_open(ms_wcCharsetName
, cname
);
967 // NB: don't ever pass NULL to iconv_open(), it may crash!
968 if ( ms_wcCharsetName
)
970 w2m
= iconv_open( cname
, ms_wcCharsetName
);
978 wxMBConv_iconv::~wxMBConv_iconv()
980 if ( m2w
!= (iconv_t
)-1 )
982 if ( w2m
!= (iconv_t
)-1 )
986 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
988 size_t inbuf
= strlen(psz
);
989 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
991 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
992 wchar_t *bufPtr
= buf
;
993 const char *pszPtr
= psz
;
997 // have destination buffer, convert there
999 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1000 (char**)&bufPtr
, &outbuf
);
1001 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1005 // convert to native endianness
1006 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
1009 // NB: iconv was given only strlen(psz) characters on input, and so
1010 // it couldn't convert the trailing zero. Let's do it ourselves
1011 // if there's some room left for it in the output buffer.
1017 // no destination buffer... convert using temp buffer
1018 // to calculate destination buffer requirement
1023 outbuf
= 8*SIZEOF_WCHAR_T
;
1026 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1027 (char**)&bufPtr
, &outbuf
);
1029 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1030 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1033 if (ICONV_FAILED(cres
, inbuf
))
1035 //VS: it is ok if iconv fails, hence trace only
1036 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
// Converts the wide character string psz to the multibyte charset using the
// w2m iconv descriptor. With a destination buffer the text is converted
// straight into it; without one, a 16 byte scratch buffer is filled
// repeatedly for as long as iconv() reports E2BIG.
1043 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
// inbuf is the input size in BYTES (characters * SIZEOF_WCHAR_T), which is
// what iconv() expects
1045 size_t inbuf
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1049 wchar_t *tmpbuf
= 0;
1053 // need to copy to temp buffer to switch endianness
1054 // this absolutely doesn't rock!
1055 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1056 // could be in read-only memory, or be accessed in some other thread)
// NOTE(review): inbuf is already a byte count, so (inbuf+1)*SIZEOF_WCHAR_T is
// far more than the inbuf + SIZEOF_WCHAR_T bytes that psz (including its
// terminator) occupies: the malloc() over-allocates and the memcpy() reads
// past the end of psz. WC_BSWAP uses its second argument as an element count
// (see the BSWAP_UCS4 / BSWAP_UTF16 loops), so it should presumably be given
// inbuf / SIZEOF_WCHAR_T rather than inbuf.
1057 tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
1058 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
1059 WC_BSWAP(tmpbuf
, inbuf
)
1065 // have destination buffer, convert there
1066 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1070 // NB: iconv was given only wcslen(psz) characters on input, and so
1071 // it couldn't convert the trailing zero. Let's do it ourselves
1072 // if there's some room left for it in the output buffer.
1078 // no destination buffer... convert using temp buffer
1079 // to calculate destination buffer requirement
// point iconv() at the start of the scratch buffer again before each call
1083 buf
= tbuf
; outbuf
= 16;
1085 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
// E2BIG only means that the scratch buffer filled up, so go round again
1088 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1096 if (ICONV_FAILED(cres
, inbuf
))
1098 //VS: it is ok if iconv fails, hence trace only
1099 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1106 #endif // HAVE_ICONV
1109 // ============================================================================
1110 // Win32 conversion classes
1111 // ============================================================================
1113 #ifdef wxHAVE_WIN32_MB2WC
1116 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1117 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1119 class wxMBConv_win32
: public wxMBConv
1124 m_CodePage
= CP_ACP
;
1127 wxMBConv_win32(const wxChar
* name
)
1129 m_CodePage
= wxCharsetToCodepage(name
);
1132 wxMBConv_win32(wxFontEncoding encoding
)
1134 m_CodePage
= wxEncodingToCodepage(encoding
);
1137 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1139 const size_t len
= ::MultiByteToWideChar
1141 m_CodePage
, // code page
1143 psz
, // input string
1144 -1, // its length (NUL-terminated)
1145 buf
, // output string
1146 buf
? n
: 0 // size of output buffer
1149 // note that it returns count of written chars for buf != NULL and size
1150 // of the needed buffer for buf == NULL so in either case the length of
1151 // the string (which never includes the terminating NUL) is one less
1152 return len
? len
- 1 : (size_t)-1;
1155 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1158 we have a problem here: by default, WideCharToMultiByte() may
1159 replace characters unrepresentable in the target code page with bad
1160 quality approximations such as turning "1/2" symbol (U+00BD) into
1161 "1" for the code pages which don't have it and we, obviously, want
1162 to avoid this at any price
1164 the trouble is that this function does it _silently_, i.e. it won't
1165 even tell us whether it did or not... Win98/2000 and higher provide
1166 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1167 we have to resort to a round trip, i.e. check that converting back
1168 results in the same string -- this is, of course, expensive but
1169 otherwise we simply can't be sure to not garble the data.
1172 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1173 // it doesn't work with CJK encodings (which we test for rather roughly
1174 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1176 BOOL usedDef
wxDUMMY_INITIALIZE(false),
1179 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1181 // it's our lucky day
1182 flags
= WC_NO_BEST_FIT_CHARS
;
1183 pUsedDef
= &usedDef
;
1185 else // old system or unsupported encoding
1191 const size_t len
= ::WideCharToMultiByte
1193 m_CodePage
, // code page
1194 flags
, // either none or no best fit
1195 pwz
, // input string
1196 -1, // it is (wide) NUL-terminated
1197 buf
, // output buffer
1198 buf
? n
: 0, // and its size
1199 NULL
, // default "replacement" char
1200 pUsedDef
// [out] was it used?
1205 // function totally failed
1209 // if we were really converting, check if we succeeded
1214 // check if the conversion failed, i.e. if any replacements
1219 else // we must resort to double tripping...
1221 wxWCharBuffer
wcBuf(n
);
1222 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1223 wcscmp(wcBuf
, pwz
) != 0 )
1225 // we didn't obtain the same thing we started from, hence
1226 // the conversion was lossy and we consider that it failed
1232 // see the comment above for the reason of "len - 1"
1236 bool IsOk() const { return m_CodePage
!= -1; }
1239 static bool CanUseNoBestFit()
1241 static int s_isWin98Or2k
= -1;
1243 if ( s_isWin98Or2k
== -1 )
1246 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
1249 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
1253 s_isWin98Or2k
= verMaj
>= 5;
1257 // unknown, be conservative by default
1261 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1264 return s_isWin98Or2k
== 1;
1270 #endif // wxHAVE_WIN32_MB2WC
1272 // ============================================================================
1273 // Mac conversion classes
1274 // ============================================================================
1276 #if defined(__WXMAC__) && defined(TARGET_CARBON)
1278 class wxMBConv_mac
: public wxMBConv
1283 Init(CFStringGetSystemEncoding()) ;
1286 wxMBConv_mac(const wxChar
* name
)
1288 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, FALSE
) ) ) ;
1291 wxMBConv_mac(wxFontEncoding encoding
)
1293 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
1298 OSStatus status
= noErr
;
1299 status
= TECDisposeConverter(m_MB2WC_converter
);
1300 status
= TECDisposeConverter(m_WC2MB_converter
);
1304 void Init( TextEncodingBase encoding
)
1306 OSStatus status
= noErr
;
1307 m_char_encoding
= encoding
;
1308 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
1310 status
= TECCreateConverter(&m_MB2WC_converter
,
1312 m_unicode_encoding
);
1313 status
= TECCreateConverter(&m_WC2MB_converter
,
1318 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1320 OSStatus status
= noErr
;
1321 ByteCount byteOutLen
;
1322 ByteCount byteInLen
= strlen(psz
) ;
1323 wchar_t *tbuf
= NULL
;
1324 UniChar
* ubuf
= NULL
;
1330 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
1332 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
1333 #if SIZEOF_WCHAR_T == 4
1334 ubuf
= (UniChar
*) malloc( byteBufferLen
) ;
1336 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
1338 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
1339 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
1340 #if SIZEOF_WCHAR_T == 4
1341 wxMBConvUTF16BE converter
;
1342 res
= converter
.MB2WC( (buf
? buf
: tbuf
) , (const char*)ubuf
, n
) ;
1345 res
= byteOutLen
/ sizeof( UniChar
) ;
1350 if ( buf
&& res
< n
)
1356 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1358 OSStatus status
= noErr
;
1359 ByteCount byteOutLen
;
1360 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1368 tbuf
= (char*) malloc( n
) ;
1371 ByteCount byteBufferLen
= n
;
1372 UniChar
* ubuf
= NULL
;
1373 #if SIZEOF_WCHAR_T == 4
1374 wxMBConvUTF16BE converter
;
1375 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
1376 byteBufferLen
= unicharlen
;
1377 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
1378 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
) ;
1380 ubuf
= (UniChar
*) psz
;
1382 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
1383 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
1384 #if SIZEOF_WCHAR_T == 4
1390 size_t res
= byteOutLen
;
1391 if ( buf
&& res
< n
)
1398 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
1401 TECObjectRef m_MB2WC_converter
;
1402 TECObjectRef m_WC2MB_converter
;
1404 TextEncodingBase m_char_encoding
;
1405 TextEncodingBase m_unicode_encoding
;
1408 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1410 // ============================================================================
1411 // wxEncodingConverter based conversion classes
1412 // ============================================================================
1416 class wxMBConv_wxwin
: public wxMBConv
1421 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
1422 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
1426 // temporarily just use wxEncodingConverter stuff,
1427 // so that it works while a better implementation is built
1428 wxMBConv_wxwin(const wxChar
* name
)
1431 m_enc
= wxFontMapper::Get()->CharsetToEncoding(name
, false);
1433 m_enc
= wxFONTENCODING_SYSTEM
;
1438 wxMBConv_wxwin(wxFontEncoding enc
)
1445 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
1447 size_t inbuf
= strlen(psz
);
1449 m2w
.Convert(psz
,buf
);
1453 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
1455 const size_t inbuf
= wxWcslen(psz
);
1457 w2m
.Convert(psz
,buf
);
1462 bool IsOk() const { return m_ok
; }
1465 wxFontEncoding m_enc
;
1466 wxEncodingConverter m2w
, w2m
;
1468 // were we initialized successfully?
1471 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
1474 #endif // wxUSE_FONTMAP
1476 // ============================================================================
1477 // wxCSConv implementation
1478 // ============================================================================
1480 void wxCSConv::Init()
1487 wxCSConv::wxCSConv(const wxChar
*charset
)
1496 m_encoding
= wxFONTENCODING_SYSTEM
;
1499 wxCSConv::wxCSConv(wxFontEncoding encoding
)
1501 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
1503 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
1505 encoding
= wxFONTENCODING_SYSTEM
;
1510 m_encoding
= encoding
;
1513 wxCSConv::~wxCSConv()
1518 wxCSConv::wxCSConv(const wxCSConv
& conv
)
1523 SetName(conv
.m_name
);
1524 m_encoding
= conv
.m_encoding
;
1527 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
1531 SetName(conv
.m_name
);
1532 m_encoding
= conv
.m_encoding
;
1537 void wxCSConv::Clear()
1546 void wxCSConv::SetName(const wxChar
*charset
)
1550 m_name
= wxStrdup(charset
);
1555 wxMBConv
*wxCSConv::DoCreate() const
1557 // check for the special case of ASCII or ISO8859-1 charset: as we have
1558 // special knowledge of it anyhow, we don't need to create a special
1559 // conversion object
1560 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
1562 // don't convert at all
1566 // we trust OS to do conversion better than we can so try external
1567 // conversion methods first
1569 // the full order is:
1570 // 1. OS conversion (iconv() under Unix or Win32 API)
1571 // 2. hard coded conversions for UTF
1572 // 3. wxEncodingConverter as fall back
1578 #endif // !wxUSE_FONTMAP
1580 wxString
name(m_name
);
1584 name
= wxFontMapper::Get()->GetEncodingName(m_encoding
);
1585 #endif // wxUSE_FONTMAP
1587 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
1593 #endif // HAVE_ICONV
1595 #ifdef wxHAVE_WIN32_MB2WC
1597 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
1598 : new wxMBConv_win32(m_encoding
);
1604 #endif // wxHAVE_WIN32_MB2WC
1605 #if defined(__WXMAC__)
1607 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
) )
1610 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
1611 : new wxMBConv_mac(m_encoding
);
1620 wxFontEncoding enc
= m_encoding
;
1622 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
1624 // use "false" to suppress interactive dialogs -- we can be called from
1625 // anywhere and popping up a dialog from here is the last thing we want to
1627 enc
= wxFontMapper::Get()->CharsetToEncoding(m_name
, false);
1629 #endif // wxUSE_FONTMAP
1633 case wxFONTENCODING_UTF7
:
1634 return new wxMBConvUTF7
;
1636 case wxFONTENCODING_UTF8
:
1637 return new wxMBConvUTF8
;
1639 case wxFONTENCODING_UTF16BE
:
1640 return new wxMBConvUTF16BE
;
1642 case wxFONTENCODING_UTF16LE
:
1643 return new wxMBConvUTF16LE
;
1645 case wxFONTENCODING_UTF32BE
:
1646 return new wxMBConvUTF32BE
;
1648 case wxFONTENCODING_UTF32LE
:
1649 return new wxMBConvUTF32LE
;
1652 // nothing to do but put here to suppress gcc warnings
1659 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
1660 : new wxMBConv_wxwin(m_encoding
);
1666 #endif // wxUSE_FONTMAP
1668 // NB: This is a hack to prevent deadlock. What could otherwise happen
1669 // in Unicode build: wxConvLocal creation ends up being here
1670 // because of some failure and logs the error. But wxLog will try to
1671 // attach timestamp, for which it will need wxConvLocal (to convert
1672 // time to char* and then wchar_t*), but that fails, tries to log
1673 // error, but wxLog has a (already locked) critical section that
1674 // guards static buffer.
1675 static bool alreadyLoggingError
= false;
1676 if (!alreadyLoggingError
)
1678 alreadyLoggingError
= true;
1679 wxLogError(_("Cannot convert from the charset '%s'!"),
1683 wxFontMapper::GetEncodingDescription(m_encoding
).c_str()
1684 #else // !wxUSE_FONTMAP
1685 wxString::Format(_("encoding %s"), m_encoding
).c_str()
1686 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1688 alreadyLoggingError
= false;
1694 void wxCSConv::CreateConvIfNeeded() const
1698 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
1701 // if we have neither the name nor the encoding, use the default
1702 // encoding for this system
1703 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
1705 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
1707 #endif // wxUSE_INTL
1709 self
->m_convReal
= DoCreate();
1710 self
->m_deferred
= false;
1714 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1716 CreateConvIfNeeded();
1719 return m_convReal
->MB2WC(buf
, psz
, n
);
1722 size_t len
= strlen(psz
);
1726 for (size_t c
= 0; c
<= len
; c
++)
1727 buf
[c
] = (unsigned char)(psz
[c
]);
1733 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1735 CreateConvIfNeeded();
1738 return m_convReal
->WC2MB(buf
, psz
, n
);
1741 const size_t len
= wxWcslen(psz
);
1744 for (size_t c
= 0; c
<= len
; c
++)
1753 for (size_t c
= 0; c
<= len
; c
++)
1763 // ----------------------------------------------------------------------------
1765 // ----------------------------------------------------------------------------
1768 static wxMBConv_win32 wxConvLibcObj
;
1770 static wxMBConvLibc wxConvLibcObj
;
1773 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
1774 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
1775 static wxMBConvUTF7 wxConvUTF7Obj
;
1776 static wxMBConvUTF8 wxConvUTF8Obj
;
1779 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
1780 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
1781 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
1782 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
1783 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
1784 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
1786 #else // !wxUSE_WCHAR_T
1788 // stand-ins in absence of wchar_t
1789 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
1794 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T