1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
13 // ============================================================================
15 // ============================================================================
17 // ----------------------------------------------------------------------------
19 // ----------------------------------------------------------------------------
21 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
22 #pragma implementation "strconv.h"
25 // For compilers that support precompilation, includes "wx.h".
26 #include "wx/wxprec.h"
37 #include "wx/strconv.h"
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
54 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
55 #define wxHAVE_WIN32_MB2WC
56 #endif // __WIN32__ but !__WXMICROWIN__
58 // ----------------------------------------------------------------------------
60 // ----------------------------------------------------------------------------
70 #include "wx/encconv.h"
71 #include "wx/fontmap.h"
74 #include "ATSUnicode.h"
75 #include "TextCommon.h"
76 #include "TextEncodingConverter.h"
78 #include "wx/mac/private.h" // includes mac headers
80 // ----------------------------------------------------------------------------
82 // ----------------------------------------------------------------------------
84 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
85 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
87 #if SIZEOF_WCHAR_T == 4
88 #define WC_NAME "UCS4"
89 #define WC_BSWAP BSWAP_UCS4
90 #ifdef WORDS_BIGENDIAN
91 #define WC_NAME_BEST "UCS-4BE"
93 #define WC_NAME_BEST "UCS-4LE"
95 #elif SIZEOF_WCHAR_T == 2
96 #define WC_NAME "UTF16"
97 #define WC_BSWAP BSWAP_UTF16
99 #ifdef WORDS_BIGENDIAN
100 #define WC_NAME_BEST "UTF-16BE"
102 #define WC_NAME_BEST "UTF-16LE"
104 #else // sizeof(wchar_t) != 2 nor 4
105 // does this ever happen?
106 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
109 // ============================================================================
111 // ============================================================================
113 // ----------------------------------------------------------------------------
114 // UTF-16 en/decoding to/from UCS-4
115 // ----------------------------------------------------------------------------
118 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
123 *output
= (wxUint16
) input
;
126 else if (input
>=0x110000)
134 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
135 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
// Decodes the UTF-16 sequence starting at `input` (a single code unit or a
// surrogate pair) into the UCS-4 value `output`.
// Only the conditions and the pair-combining expression are visible in this
// view; the statements executed in the first two branches and the returned
// values are not shown, so they are deliberately not documented here.
141 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
// First unit outside 0xD800..0xDFFF: it is not a surrogate.
143 if ((*input
<0xd800) || (*input
>0xdfff))
// NOTE(review): this test is also true for input[1] == 0xDFFF, although the
// low (trailing) surrogate range is 0xDC00..0xDFFF inclusive. If this branch
// is the "invalid second unit" path, as the lower bound suggests, the upper
// bound should read `>0xdfff` -- confirm against the full function body.
148 else if ((input
[1]<0xdc00) || (input
[1]>=0xdfff))
// Combine the pair: subtracting 0xd7c0 (instead of 0xd800) before the shift
// folds in the 0x10000 offset, since (0xd800 - 0xd7c0) << 10 == 0x10000.
155 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
161 // ----------------------------------------------------------------------------
163 // ----------------------------------------------------------------------------
165 wxMBConv::~wxMBConv()
167 // nothing to do here
170 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
174 // calculate the length of the buffer needed first
175 size_t nLen
= MB2WC(NULL
, psz
, 0);
176 if ( nLen
!= (size_t)-1 )
178 // now do the actual conversion
179 wxWCharBuffer
buf(nLen
);
180 MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NUL
186 wxWCharBuffer
buf((wchar_t *)NULL
);
191 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
195 size_t nLen
= WC2MB(NULL
, pwz
, 0);
196 if ( nLen
!= (size_t)-1 )
198 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
199 WC2MB(buf
.data(), pwz
, nLen
+ 4);
205 wxCharBuffer
buf((char *)NULL
);
210 // ----------------------------------------------------------------------------
212 // ----------------------------------------------------------------------------
214 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
216 return wxMB2WC(buf
, psz
, n
);
219 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
221 return wxWC2MB(buf
, psz
, n
);
224 // ----------------------------------------------------------------------------
226 // ----------------------------------------------------------------------------
229 static char utf7_setD
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
230 "abcdefghijklmnopqrstuvwxyz"
231 "0123456789'(),-./:?";
232 static char utf7_setO
[]="!\"#$%&*;<=>@[]^_`{|}";
233 static char utf7_setB
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
234 "abcdefghijklmnopqrstuvwxyz"
238 // TODO: write actual implementations of UTF-7 here
239 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf
),
240 const char * WXUNUSED(psz
),
241 size_t WXUNUSED(n
)) const
246 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf
),
247 const wchar_t * WXUNUSED(psz
),
248 size_t WXUNUSED(n
)) const
253 // ----------------------------------------------------------------------------
255 // ----------------------------------------------------------------------------
257 static wxUint32 utf8_max
[]=
258 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
260 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
264 while (*psz
&& ((!buf
) || (len
< n
)))
266 unsigned char cc
= *psz
++, fc
= cc
;
268 for (cnt
= 0; fc
& 0x80; cnt
++)
282 // invalid UTF-8 sequence
287 unsigned ocnt
= cnt
- 1;
288 wxUint32 res
= cc
& (0x3f >> cnt
);
292 if ((cc
& 0xC0) != 0x80)
294 // invalid UTF-8 sequence
297 res
= (res
<< 6) | (cc
& 0x3f);
299 if (res
<= utf8_max
[ocnt
])
301 // illegal UTF-8 encoding
305 // cast is ok because wchar_t == wxUint16 if WC_UTF16
306 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
307 if (pa
== (size_t)-1)
316 #endif // WC_UTF16/!WC_UTF16
320 if (buf
&& (len
< n
))
325 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
329 while (*psz
&& ((!buf
) || (len
< n
)))
333 // cast is ok for WC_UTF16
334 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
335 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
337 cc
=(*psz
++) & 0x7fffffff;
340 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
354 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
356 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
361 if (buf
&& (len
<n
)) *buf
= 0;
369 // ----------------------------------------------------------------------------
371 // ----------------------------------------------------------------------------
373 #ifdef WORDS_BIGENDIAN
374 #define wxMBConvUTF16straight wxMBConvUTF16BE
375 #define wxMBConvUTF16swap wxMBConvUTF16LE
377 #define wxMBConvUTF16swap wxMBConvUTF16BE
378 #define wxMBConvUTF16straight wxMBConvUTF16LE
384 // copy 16bit MB to 16bit String
385 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
389 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
392 *buf
++ = *(wxUint16
*)psz
;
395 psz
+= sizeof(wxUint16
);
397 if (buf
&& len
<n
) *buf
=0;
403 // copy 16bit String to 16bit MB
404 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
408 while (*psz
&& (!buf
|| len
< n
))
412 *(wxUint16
*)buf
= *psz
;
413 buf
+= sizeof(wxUint16
);
415 len
+= sizeof(wxUint16
);
418 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
424 // swap 16bit MB to 16bit String
425 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
429 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
433 ((char *)buf
)[0] = psz
[1];
434 ((char *)buf
)[1] = psz
[0];
438 psz
+= sizeof(wxUint16
);
440 if (buf
&& len
<n
) *buf
=0;
446 // swap 16bit String to 16bit MB
447 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
451 while (*psz
&& (!buf
|| len
< n
))
455 *buf
++ = ((char*)psz
)[1];
456 *buf
++ = ((char*)psz
)[0];
458 len
+= sizeof(wxUint16
);
461 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
470 // copy 16bit MB to 32bit String
471 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
475 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
478 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
479 if (pa
== (size_t)-1)
485 psz
+= pa
* sizeof(wxUint16
);
487 if (buf
&& len
<n
) *buf
=0;
493 // copy 32bit String to 16bit MB
494 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
498 while (*psz
&& (!buf
|| len
< n
))
501 size_t pa
=encode_utf16(*psz
, cc
);
503 if (pa
== (size_t)-1)
508 *(wxUint16
*)buf
= cc
[0];
509 buf
+= sizeof(wxUint16
);
512 *(wxUint16
*)buf
= cc
[1];
513 buf
+= sizeof(wxUint16
);
517 len
+= pa
*sizeof(wxUint16
);
520 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
526 // swap 16bit MB to 32bit String
527 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
531 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
535 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
536 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
538 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
539 if (pa
== (size_t)-1)
546 psz
+= pa
* sizeof(wxUint16
);
548 if (buf
&& len
<n
) *buf
=0;
554 // swap 32bit String to 16bit MB
555 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
559 while (*psz
&& (!buf
|| len
< n
))
562 size_t pa
=encode_utf16(*psz
, cc
);
564 if (pa
== (size_t)-1)
569 *buf
++ = ((char*)cc
)[1];
570 *buf
++ = ((char*)cc
)[0];
573 *buf
++ = ((char*)cc
)[3];
574 *buf
++ = ((char*)cc
)[2];
578 len
+= pa
*sizeof(wxUint16
);
581 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
589 // ----------------------------------------------------------------------------
591 // ----------------------------------------------------------------------------
593 #ifdef WORDS_BIGENDIAN
594 #define wxMBConvUTF32straight wxMBConvUTF32BE
595 #define wxMBConvUTF32swap wxMBConvUTF32LE
597 #define wxMBConvUTF32swap wxMBConvUTF32BE
598 #define wxMBConvUTF32straight wxMBConvUTF32LE
602 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
603 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
608 // copy 32bit MB to 16bit String
609 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
613 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
617 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
618 if (pa
== (size_t)-1)
628 psz
+= sizeof(wxUint32
);
630 if (buf
&& len
<n
) *buf
=0;
636 // copy 16bit String to 32bit MB
637 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
641 while (*psz
&& (!buf
|| len
< n
))
645 // cast is ok for WC_UTF16
646 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
647 if (pa
== (size_t)-1)
652 *(wxUint32
*)buf
= cc
;
653 buf
+= sizeof(wxUint32
);
655 len
+= sizeof(wxUint32
);
659 if (buf
&& len
<=n
-sizeof(wxUint32
))
667 // swap 32bit MB to 16bit String
668 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
672 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
675 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
676 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
681 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
682 if (pa
== (size_t)-1)
692 psz
+= sizeof(wxUint32
);
702 // swap 16bit String to 32bit MB
703 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
707 while (*psz
&& (!buf
|| len
< n
))
711 // cast is ok for WC_UTF16
712 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
713 if (pa
== (size_t)-1)
723 len
+= sizeof(wxUint32
);
727 if (buf
&& len
<=n
-sizeof(wxUint32
))
736 // copy 32bit MB to 32bit String
737 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
741 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
744 *buf
++ = *(wxUint32
*)psz
;
746 psz
+= sizeof(wxUint32
);
756 // copy 32bit String to 32bit MB
757 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
761 while (*psz
&& (!buf
|| len
< n
))
765 *(wxUint32
*)buf
= *psz
;
766 buf
+= sizeof(wxUint32
);
769 len
+= sizeof(wxUint32
);
773 if (buf
&& len
<=n
-sizeof(wxUint32
))
780 // swap 32bit MB to 32bit String
781 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
785 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
789 ((char *)buf
)[0] = psz
[3];
790 ((char *)buf
)[1] = psz
[2];
791 ((char *)buf
)[2] = psz
[1];
792 ((char *)buf
)[3] = psz
[0];
796 psz
+= sizeof(wxUint32
);
806 // swap 32bit String to 32bit MB
807 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
811 while (*psz
&& (!buf
|| len
< n
))
815 *buf
++ = ((char *)psz
)[3];
816 *buf
++ = ((char *)psz
)[2];
817 *buf
++ = ((char *)psz
)[1];
818 *buf
++ = ((char *)psz
)[0];
820 len
+= sizeof(wxUint32
);
824 if (buf
&& len
<=n
-sizeof(wxUint32
))
834 // ============================================================================
835 // The classes doing conversion using the iconv_xxx() functions
836 // ============================================================================
840 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
841 // if output buffer is _exactly_ as big as needed. Such case is (unless there's
842 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
843 // (which means error) and says there are 0 bytes left in the input buffer --
844 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
845 // this alternative test for iconv() failure.
846 // [This bug does not appear in glibc 2.2.]
847 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
848 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
849 (errno != E2BIG || bufLeft != 0))
851 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
854 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
856 // ----------------------------------------------------------------------------
857 // wxMBConv_iconv: encapsulates an iconv character set
858 // ----------------------------------------------------------------------------
860 class wxMBConv_iconv
: public wxMBConv
863 wxMBConv_iconv(const wxChar
*name
);
864 virtual ~wxMBConv_iconv();
866 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
867 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
870 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
873 // the iconv handlers used to translate from multibyte to wide char and in
874 // the other direction
879 // the name (for iconv_open()) of a wide char charset -- if none is
880 // available on this machine, it will remain NULL
881 static const char *ms_wcCharsetName
;
883 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
884 // different endian-ness than the native one
885 static bool ms_wcNeedsSwap
;
888 const char *wxMBConv_iconv::ms_wcCharsetName
= NULL
;
889 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
891 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
893 // Do it the hard way
895 for (size_t i
= 0; i
< wxStrlen(name
)+1; i
++)
896 cname
[i
] = (char) name
[i
];
898 // check for charset that represents wchar_t:
899 if (ms_wcCharsetName
== NULL
)
901 ms_wcNeedsSwap
= false;
903 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
904 ms_wcCharsetName
= WC_NAME_BEST
;
905 m2w
= iconv_open(ms_wcCharsetName
, cname
);
907 if (m2w
== (iconv_t
)-1)
909 // try charset w/o bytesex info (e.g. "UCS4")
910 // and check for bytesex ourselves:
911 ms_wcCharsetName
= WC_NAME
;
912 m2w
= iconv_open(ms_wcCharsetName
, cname
);
914 // last bet, try if it knows WCHAR_T pseudo-charset
915 if (m2w
== (iconv_t
)-1)
917 ms_wcCharsetName
= "WCHAR_T";
918 m2w
= iconv_open(ms_wcCharsetName
, cname
);
921 if (m2w
!= (iconv_t
)-1)
923 char buf
[2], *bufPtr
;
924 wchar_t wbuf
[2], *wbufPtr
;
932 outsz
= SIZEOF_WCHAR_T
* 2;
936 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
937 (char**)&wbufPtr
, &outsz
);
939 if (ICONV_FAILED(res
, insz
))
941 ms_wcCharsetName
= NULL
;
942 wxLogLastError(wxT("iconv"));
943 wxLogError(_("Conversion to charset '%s' doesn't work."), name
);
947 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
952 ms_wcCharsetName
= NULL
;
954 // VS: we must not output an error here, since wxWindows will safely
955 // fall back to using wxEncodingConverter.
956 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name
);
960 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName
, ms_wcNeedsSwap
);
962 else // we already have ms_wcCharsetName
964 m2w
= iconv_open(ms_wcCharsetName
, cname
);
967 // NB: don't ever pass NULL to iconv_open(), it may crash!
968 if ( ms_wcCharsetName
)
970 w2m
= iconv_open( cname
, ms_wcCharsetName
);
978 wxMBConv_iconv::~wxMBConv_iconv()
980 if ( m2w
!= (iconv_t
)-1 )
982 if ( w2m
!= (iconv_t
)-1 )
986 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
988 size_t inbuf
= strlen(psz
);
989 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
991 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
992 wchar_t *bufPtr
= buf
;
993 const char *pszPtr
= psz
;
997 // have destination buffer, convert there
999 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1000 (char**)&bufPtr
, &outbuf
);
1001 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1005 // convert to native endianness
1006 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
1009 // NB: iconv was given only strlen(psz) characters on input, and so
1010 // it couldn't convert the trailing zero. Let's do it ourselves
1011 // if there's some room left for it in the output buffer.
1017 // no destination buffer... convert using temp buffer
1018 // to calculate destination buffer requirement
1023 outbuf
= 8*SIZEOF_WCHAR_T
;
1026 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1027 (char**)&bufPtr
, &outbuf
);
1029 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1030 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1033 if (ICONV_FAILED(cres
, inbuf
))
1035 //VS: it is ok if iconv fails, hence trace only
1036 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
// Converts the NUL-terminated wide string `psz` to the multibyte charset by
// calling iconv() on the `w2m` descriptor, either into `buf` or, when no
// destination buffer is given, into a small temporary buffer.
// Several original lines of this function are not visible in this view
// (declarations, branch conditions, the returned value), so only what the
// visible statements establish is described.
1043 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
// `inbuf` is the input length in BYTES (character count * SIZEOF_WCHAR_T);
// it is passed to iconv() below as the input byte count.
1045 size_t inbuf
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1049 wchar_t *tmpbuf
= 0;
1053 // need to copy to temp buffer to switch endianness
1054 // this absolutely doesn't rock!
1055 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1056 // could be in read-only memory, or be accessed in some other thread)
// NOTE(review): `inbuf` is already a byte count, yet it is multiplied by
// SIZEOF_WCHAR_T again here. The malloc() merely over-allocates, but the
// memcpy() copies (inbuf+1)*SIZEOF_WCHAR_T bytes out of `psz`, which holds
// only inbuf + SIZEOF_WCHAR_T bytes including its terminator -- a read past
// the end of the source for any non-empty string when SIZEOF_WCHAR_T > 1.
// WC_BSWAP's second argument is an element count (see BSWAP_UCS4 /
// BSWAP_UTF16), so it also swaps SIZEOF_WCHAR_T times too many elements.
// The character count (inbuf / SIZEOF_WCHAR_T) looks like the intended
// quantity in all three places -- confirm and fix.
1057 tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
1058 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
1059 WC_BSWAP(tmpbuf
, inbuf
)
1065 // have destination buffer, convert there
1066 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1070 // NB: iconv was given only wcslen(psz) characters on input, and so
1071 // it couldn't convert the trailing zero. Let's do it ourselves
1072 // if there's some room left for it in the output buffer.
1078 // no destination buffer... convert using temp buffer
1079 // to calculate destination buffer requirement
// Each pass points iconv() at the 16-byte scratch buffer `tbuf`; the loop
// repeats for as long as iconv() fails with E2BIG (output buffer full).
1083 buf
= tbuf
; outbuf
= 16;
1085 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1088 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
// ICONV_FAILED also takes the remaining input byte count so that the
// glibc 2.1 workaround defined with the macro can tell real errors apart.
1096 if (ICONV_FAILED(cres
, inbuf
))
1098 //VS: it is ok if iconv fails, hence trace only
1099 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1106 #endif // HAVE_ICONV
1109 // ============================================================================
1110 // Win32 conversion classes
1111 // ============================================================================
1113 #ifdef wxHAVE_WIN32_MB2WC
1116 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1117 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1119 class wxMBConv_win32
: public wxMBConv
1124 m_CodePage
= CP_ACP
;
1127 wxMBConv_win32(const wxChar
* name
)
1129 m_CodePage
= wxCharsetToCodepage(name
);
1132 wxMBConv_win32(wxFontEncoding encoding
)
1134 m_CodePage
= wxEncodingToCodepage(encoding
);
1137 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1139 const size_t len
= ::MultiByteToWideChar
1141 m_CodePage
, // code page
1143 psz
, // input string
1144 -1, // its length (NUL-terminated)
1145 buf
, // output string
1146 buf
? n
: 0 // size of output buffer
1149 // note that it returns count of written chars for buf != NULL and size
1150 // of the needed buffer for buf == NULL so in either case the length of
1151 // the string (which never includes the terminating NUL) is one less
1152 return len
? len
- 1 : (size_t)-1;
1155 size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
1158 we have a problem here: by default, WideCharToMultiByte() may
1159 replace characters unrepresentable in the target code page with bad
1160 quality approximations such as turning "1/2" symbol (U+00BD) into
1161 "1" for the code pages which don't have it and we, obviously, want
1162 to avoid this at any price
1164 the trouble is that this function does it _silently_, i.e. it won't
1165 even tell us whether it did or not... Win98/2000 and higher provide
1166 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1167 we have to resort to a round trip, i.e. check that converting back
1168 results in the same string -- this is, of course, expensive but
1169 otherwise we simply can't be sure to not garble the data.
1172 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1173 // it doesn't work with CJK encodings (which we test for rather roughly
1174 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1176 BOOL usedDef
wxDUMMY_INITIALIZE(false),
1179 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
1181 // it's our lucky day
1182 flags
= WC_NO_BEST_FIT_CHARS
;
1183 pUsedDef
= &usedDef
;
1185 else // old system or unsupported encoding
1191 const size_t len
= ::WideCharToMultiByte
1193 m_CodePage
, // code page
1194 flags
, // either none or no best fit
1195 pwz
, // input string
1196 -1, // it is (wide) NUL-terminated
1197 buf
, // output buffer
1198 buf
? n
: 0, // and its size
1199 NULL
, // default "replacement" char
1200 pUsedDef
// [out] was it used?
1205 // function totally failed
1209 // if we were really converting, check if we succeeded
1214 // check if the conversion failed, i.e. if any replacements
1219 else // we must resort to double tripping...
1221 wxWCharBuffer
wcBuf(n
);
1222 if ( MB2WC(wcBuf
.data(), buf
, n
) == (size_t)-1 ||
1223 wcscmp(wcBuf
, pwz
) != 0 )
1225 // we didn't obtain the same thing we started from, hence
1226 // the conversion was lossy and we consider that it failed
1232 // see the comment above for the reason of "len - 1"
1236 bool IsOk() const { return m_CodePage
!= -1; }
1239 static bool CanUseNoBestFit()
1241 static int s_isWin98Or2k
= -1;
1243 if ( s_isWin98Or2k
== -1 )
1246 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
1249 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
1253 s_isWin98Or2k
= verMaj
>= 5;
1257 // unknown, be conservative by default
1261 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
1264 return s_isWin98Or2k
== 1;
1270 #endif // wxHAVE_WIN32_MB2WC
1272 // ============================================================================
1273 // Mac conversion classes
1274 // ============================================================================
1276 #if defined(__WXMAC__) && defined(TARGET_CARBON)
1278 class wxMBConv_mac
: public wxMBConv
1283 Init(CFStringGetSystemEncoding()) ;
1286 wxMBConv_mac(const wxChar
* name
)
1288 Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name
, FALSE
) ) ) ;
1291 wxMBConv_mac(wxFontEncoding encoding
)
1293 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
1298 OSStatus status
= noErr
;
1299 status
= TECDisposeConverter(m_MB2WC_converter
);
1300 status
= TECDisposeConverter(m_WC2MB_converter
);
1304 void Init( TextEncodingBase encoding
)
1306 OSStatus status
= noErr
;
1307 m_char_encoding
= encoding
;
1308 #if SIZEOF_WCHAR_T == 4
1309 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode32BitFormat
) ;
1311 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
1313 status
= TECCreateConverter(&m_MB2WC_converter
,
1315 m_unicode_encoding
);
1316 status
= TECCreateConverter(&m_WC2MB_converter
,
1321 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1323 OSStatus status
= noErr
;
1324 ByteCount byteOutLen
;
1325 ByteCount byteInLen
= strlen(psz
) ;
1326 wchar_t *tbuf
= NULL
;
1331 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
1334 ByteCount byteBufferLen
= n
* SIZEOF_WCHAR_T
;
1335 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
1336 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
1341 size_t res
= byteOutLen
/ SIZEOF_WCHAR_T
;
1342 if ( buf
&& res
< n
)
1348 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1350 OSStatus status
= noErr
;
1351 ByteCount byteOutLen
;
1352 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1360 tbuf
= (char*) malloc( n
) ;
1363 ByteCount byteBufferLen
= n
;
1364 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
1365 (TextPtr
) ( buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
1370 size_t res
= byteOutLen
;
1371 if ( buf
&& res
< n
)
1378 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
1381 TECObjectRef m_MB2WC_converter
;
1382 TECObjectRef m_WC2MB_converter
;
1384 TextEncodingBase m_char_encoding
;
1385 TextEncodingBase m_unicode_encoding
;
1388 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1390 // ============================================================================
1391 // wxEncodingConverter based conversion classes
1392 // ============================================================================
1396 class wxMBConv_wxwin
: public wxMBConv
1401 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
1402 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
1406 // temporarily just use wxEncodingConverter stuff,
1407 // so that it works while a better implementation is built
1408 wxMBConv_wxwin(const wxChar
* name
)
1411 m_enc
= wxFontMapper::Get()->CharsetToEncoding(name
, false);
1413 m_enc
= wxFONTENCODING_SYSTEM
;
1418 wxMBConv_wxwin(wxFontEncoding enc
)
1425 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
1427 size_t inbuf
= strlen(psz
);
1429 m2w
.Convert(psz
,buf
);
1433 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
1435 const size_t inbuf
= wxWcslen(psz
);
1437 w2m
.Convert(psz
,buf
);
1442 bool IsOk() const { return m_ok
; }
1445 wxFontEncoding m_enc
;
1446 wxEncodingConverter m2w
, w2m
;
1448 // were we initialized successfully?
1451 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
1454 #endif // wxUSE_FONTMAP
1456 // ============================================================================
1457 // wxCSConv implementation
1458 // ============================================================================
1460 void wxCSConv::Init()
1467 wxCSConv::wxCSConv(const wxChar
*charset
)
1476 m_encoding
= wxFONTENCODING_SYSTEM
;
1479 wxCSConv::wxCSConv(wxFontEncoding encoding
)
1481 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
1483 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
1485 encoding
= wxFONTENCODING_SYSTEM
;
1490 m_encoding
= encoding
;
1493 wxCSConv::~wxCSConv()
1498 wxCSConv::wxCSConv(const wxCSConv
& conv
)
1503 SetName(conv
.m_name
);
1504 m_encoding
= conv
.m_encoding
;
1507 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
1511 SetName(conv
.m_name
);
1512 m_encoding
= conv
.m_encoding
;
1517 void wxCSConv::Clear()
1526 void wxCSConv::SetName(const wxChar
*charset
)
1530 m_name
= wxStrdup(charset
);
1535 wxMBConv
*wxCSConv::DoCreate() const
1537 // check for the special case of ASCII or ISO8859-1 charset: as we have
1538 // special knowledge of it anyhow, we don't need to create a special
1539 // conversion object
1540 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
1542 // don't convert at all
1546 // we trust OS to do conversion better than we can so try external
1547 // conversion methods first
1549 // the full order is:
1550 // 1. OS conversion (iconv() under Unix or Win32 API)
1551 // 2. hard coded conversions for UTF
1552 // 3. wxEncodingConverter as fall back
1558 #endif // !wxUSE_FONTMAP
1560 wxString
name(m_name
);
1564 name
= wxFontMapper::Get()->GetEncodingName(m_encoding
);
1565 #endif // wxUSE_FONTMAP
1567 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
1573 #endif // HAVE_ICONV
1575 #ifdef wxHAVE_WIN32_MB2WC
1577 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
1578 : new wxMBConv_win32(m_encoding
);
1584 #endif // wxHAVE_WIN32_MB2WC
1585 #if defined(__WXMAC__)
1587 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
) )
1590 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
1591 : new wxMBConv_mac(m_encoding
);
1600 wxFontEncoding enc
= m_encoding
;
1602 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
1604 // use "false" to suppress interactive dialogs -- we can be called from
1605 // anywhere and popping up a dialog from here is the last thing we want to
1607 enc
= wxFontMapper::Get()->CharsetToEncoding(m_name
, false);
1609 #endif // wxUSE_FONTMAP
1613 case wxFONTENCODING_UTF7
:
1614 return new wxMBConvUTF7
;
1616 case wxFONTENCODING_UTF8
:
1617 return new wxMBConvUTF8
;
1619 case wxFONTENCODING_UTF16BE
:
1620 return new wxMBConvUTF16BE
;
1622 case wxFONTENCODING_UTF16LE
:
1623 return new wxMBConvUTF16LE
;
1625 case wxFONTENCODING_UTF32BE
:
1626 return new wxMBConvUTF32BE
;
1628 case wxFONTENCODING_UTF32LE
:
1629 return new wxMBConvUTF32LE
;
1632 // nothing to do but put here to suppress gcc warnings
1639 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
1640 : new wxMBConv_wxwin(m_encoding
);
1646 #endif // wxUSE_FONTMAP
1648 // NB: This is a hack to prevent deadlock. What could otherwise happen
1649 // in Unicode build: wxConvLocal creation ends up being here
1650 // because of some failure and logs the error. But wxLog will try to
1651 // attach timestamp, for which it will need wxConvLocal (to convert
1652 // time to char* and then wchar_t*), but that fails, tries to log
1653 // error, but wxLog has a (already locked) critical section that
1654 // guards static buffer.
1655 static bool alreadyLoggingError
= false;
1656 if (!alreadyLoggingError
)
1658 alreadyLoggingError
= true;
1659 wxLogError(_("Cannot convert from the charset '%s'!"),
1663 wxFontMapper::GetEncodingDescription(m_encoding
).c_str()
1664 #else // !wxUSE_FONTMAP
1665 wxString::Format(_("encoding %s"), m_encoding
).c_str()
1666 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1668 alreadyLoggingError
= false;
1674 void wxCSConv::CreateConvIfNeeded() const
1678 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
1681 // if we have neither the name nor the encoding, use the default
1682 // encoding for this system
1683 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
1685 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
1687 #endif // wxUSE_INTL
1689 self
->m_convReal
= DoCreate();
1690 self
->m_deferred
= false;
1694 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1696 CreateConvIfNeeded();
1699 return m_convReal
->MB2WC(buf
, psz
, n
);
1702 size_t len
= strlen(psz
);
1706 for (size_t c
= 0; c
<= len
; c
++)
1707 buf
[c
] = (unsigned char)(psz
[c
]);
1713 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1715 CreateConvIfNeeded();
1718 return m_convReal
->WC2MB(buf
, psz
, n
);
1721 const size_t len
= wxWcslen(psz
);
1724 for (size_t c
= 0; c
<= len
; c
++)
1733 for (size_t c
= 0; c
<= len
; c
++)
1743 // ----------------------------------------------------------------------------
1745 // ----------------------------------------------------------------------------
1748 static wxMBConv_win32 wxConvLibcObj
;
1750 static wxMBConvLibc wxConvLibcObj
;
1753 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
1754 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
1755 static wxMBConvUTF7 wxConvUTF7Obj
;
1756 static wxMBConvUTF8 wxConvUTF8Obj
;
1759 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
1760 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
1761 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
1762 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
1763 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
1764 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
1766 #else // !wxUSE_WCHAR_T
1768 // stand-ins in absence of wchar_t
1769 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
1774 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T