1 /////////////////////////////////////////////////////////////////////////////
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // Licence: wxWindows licence
11 /////////////////////////////////////////////////////////////////////////////
13 // ============================================================================
15 // ============================================================================
17 // ----------------------------------------------------------------------------
19 // ----------------------------------------------------------------------------
21 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
22 #pragma implementation "strconv.h"
25 // For compilers that support precompilation, includes "wx.h".
26 #include "wx/wxprec.h"
37 #include "wx/strconv.h"
42 #include "wx/msw/private.h"
53 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
54 #define wxHAVE_WIN32_MB2WC
55 #endif // __WIN32__ but !__WXMICROWIN__
57 // ----------------------------------------------------------------------------
59 // ----------------------------------------------------------------------------
69 #include "wx/encconv.h"
70 #include "wx/fontmap.h"
73 #include "ATSUnicode.h"
74 #include "TextCommon.h"
75 #include "TextEncodingConverter.h"
77 #include "wx/mac/private.h" // includes mac headers
79 // ----------------------------------------------------------------------------
81 // ----------------------------------------------------------------------------
83 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
84 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
86 #if SIZEOF_WCHAR_T == 4
87 #define WC_NAME "UCS4"
88 #define WC_BSWAP BSWAP_UCS4
89 #ifdef WORDS_BIGENDIAN
90 #define WC_NAME_BEST "UCS-4BE"
92 #define WC_NAME_BEST "UCS-4LE"
94 #elif SIZEOF_WCHAR_T == 2
95 #define WC_NAME "UTF16"
96 #define WC_BSWAP BSWAP_UTF16
98 #ifdef WORDS_BIGENDIAN
99 #define WC_NAME_BEST "UTF-16BE"
101 #define WC_NAME_BEST "UTF-16LE"
103 #else // sizeof(wchar_t) != 2 nor 4
104 // does this ever happen?
105 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
108 // ============================================================================
110 // ============================================================================
112 // ----------------------------------------------------------------------------
113 // UTF-16 en/decoding to/from UCS-4
114 // ----------------------------------------------------------------------------
117 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
122 *output
= (wxUint16
) input
;
125 else if (input
>=0x110000)
133 *output
++ = (wxUint16
) ((input
>> 10)+0xd7c0);
134 *output
= (wxUint16
) ((input
&0x3ff)+0xdc00);
140 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
142 if ((*input
<0xd800) || (*input
>0xdfff))
147 else if ((input
[1]<0xdc00) || (input
[1]>=0xdfff))
154 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
160 // ----------------------------------------------------------------------------
162 // ----------------------------------------------------------------------------
164 wxMBConv::~wxMBConv()
166 // nothing to do here
169 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
173 // calculate the length of the buffer needed first
174 size_t nLen
= MB2WC(NULL
, psz
, 0);
175 if ( nLen
!= (size_t)-1 )
177 // now do the actual conversion
178 wxWCharBuffer
buf(nLen
);
179 MB2WC(buf
.data(), psz
, nLen
+ 1); // with the trailing NUL
185 wxWCharBuffer
buf((wchar_t *)NULL
);
190 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
194 size_t nLen
= WC2MB(NULL
, pwz
, 0);
195 if ( nLen
!= (size_t)-1 )
197 wxCharBuffer
buf(nLen
+3); // space for a wxUint32 trailing zero
198 WC2MB(buf
.data(), pwz
, nLen
+ 4);
204 wxCharBuffer
buf((char *)NULL
);
209 // ----------------------------------------------------------------------------
211 // ----------------------------------------------------------------------------
213 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
215 return wxMB2WC(buf
, psz
, n
);
218 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
220 return wxWC2MB(buf
, psz
, n
);
223 // ----------------------------------------------------------------------------
225 // ----------------------------------------------------------------------------
228 static char utf7_setD
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
229 "abcdefghijklmnopqrstuvwxyz"
230 "0123456789'(),-./:?";
231 static char utf7_setO
[]="!\"#$%&*;<=>@[]^_`{|}";
232 static char utf7_setB
[]="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
233 "abcdefghijklmnopqrstuvwxyz"
237 // TODO: write actual implementations of UTF-7 here
238 size_t wxMBConvUTF7::MB2WC(wchar_t * WXUNUSED(buf
),
239 const char * WXUNUSED(psz
),
240 size_t WXUNUSED(n
)) const
245 size_t wxMBConvUTF7::WC2MB(char * WXUNUSED(buf
),
246 const wchar_t * WXUNUSED(psz
),
247 size_t WXUNUSED(n
)) const
252 // ----------------------------------------------------------------------------
254 // ----------------------------------------------------------------------------
256 static wxUint32 utf8_max
[]=
257 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
259 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
263 while (*psz
&& ((!buf
) || (len
< n
)))
265 unsigned char cc
= *psz
++, fc
= cc
;
267 for (cnt
= 0; fc
& 0x80; cnt
++)
281 // invalid UTF-8 sequence
286 unsigned ocnt
= cnt
- 1;
287 wxUint32 res
= cc
& (0x3f >> cnt
);
291 if ((cc
& 0xC0) != 0x80)
293 // invalid UTF-8 sequence
296 res
= (res
<< 6) | (cc
& 0x3f);
298 if (res
<= utf8_max
[ocnt
])
300 // illegal UTF-8 encoding
304 // cast is ok because wchar_t == wxUint16 if WC_UTF16
305 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
306 if (pa
== (size_t)-1)
315 #endif // WC_UTF16/!WC_UTF16
319 if (buf
&& (len
< n
))
324 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
328 while (*psz
&& ((!buf
) || (len
< n
)))
332 // cast is ok for WC_UTF16
333 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
334 psz
+= (pa
== (size_t)-1) ? 1 : pa
;
336 cc
=(*psz
++) & 0x7fffffff;
339 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++) {}
353 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
355 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
360 if (buf
&& (len
<n
)) *buf
= 0;
368 // ----------------------------------------------------------------------------
370 // ----------------------------------------------------------------------------
372 #ifdef WORDS_BIGENDIAN
373 #define wxMBConvUTF16straight wxMBConvUTF16BE
374 #define wxMBConvUTF16swap wxMBConvUTF16LE
376 #define wxMBConvUTF16swap wxMBConvUTF16BE
377 #define wxMBConvUTF16straight wxMBConvUTF16LE
383 // copy 16bit MB to 16bit String
384 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
388 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
391 *buf
++ = *(wxUint16
*)psz
;
394 psz
+= sizeof(wxUint16
);
396 if (buf
&& len
<n
) *buf
=0;
402 // copy 16bit String to 16bit MB
403 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
407 while (*psz
&& (!buf
|| len
< n
))
411 *(wxUint16
*)buf
= *psz
;
412 buf
+= sizeof(wxUint16
);
414 len
+= sizeof(wxUint16
);
417 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
423 // swap 16bit MB to 16bit String
424 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
428 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
432 ((char *)buf
)[0] = psz
[1];
433 ((char *)buf
)[1] = psz
[0];
437 psz
+= sizeof(wxUint16
);
439 if (buf
&& len
<n
) *buf
=0;
445 // swap 16bit String to 16bit MB
446 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
450 while (*psz
&& (!buf
|| len
< n
))
454 *buf
++ = ((char*)psz
)[1];
455 *buf
++ = ((char*)psz
)[0];
457 len
+= sizeof(wxUint16
);
460 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
469 // copy 16bit MB to 32bit String
470 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
474 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
477 size_t pa
=decode_utf16((wxUint16
*)psz
, cc
);
478 if (pa
== (size_t)-1)
484 psz
+= pa
* sizeof(wxUint16
);
486 if (buf
&& len
<n
) *buf
=0;
492 // copy 32bit String to 16bit MB
493 size_t wxMBConvUTF16straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
497 while (*psz
&& (!buf
|| len
< n
))
500 size_t pa
=encode_utf16(*psz
, cc
);
502 if (pa
== (size_t)-1)
507 *(wxUint16
*)buf
= cc
[0];
508 buf
+= sizeof(wxUint16
);
511 *(wxUint16
*)buf
= cc
[1];
512 buf
+= sizeof(wxUint16
);
516 len
+= pa
*sizeof(wxUint16
);
519 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
525 // swap 16bit MB to 32bit String
526 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
530 while (*(wxUint16
*)psz
&& (!buf
|| len
< n
))
534 tmp
[0]=psz
[1]; tmp
[1]=psz
[0];
535 tmp
[2]=psz
[3]; tmp
[3]=psz
[2];
537 size_t pa
=decode_utf16((wxUint16
*)tmp
, cc
);
538 if (pa
== (size_t)-1)
545 psz
+= pa
* sizeof(wxUint16
);
547 if (buf
&& len
<n
) *buf
=0;
553 // swap 32bit String to 16bit MB
554 size_t wxMBConvUTF16swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
558 while (*psz
&& (!buf
|| len
< n
))
561 size_t pa
=encode_utf16(*psz
, cc
);
563 if (pa
== (size_t)-1)
568 *buf
++ = ((char*)cc
)[1];
569 *buf
++ = ((char*)cc
)[0];
572 *buf
++ = ((char*)cc
)[3];
573 *buf
++ = ((char*)cc
)[2];
577 len
+= pa
*sizeof(wxUint16
);
580 if (buf
&& len
<=n
-sizeof(wxUint16
)) *(wxUint16
*)buf
=0;
588 // ----------------------------------------------------------------------------
590 // ----------------------------------------------------------------------------
592 #ifdef WORDS_BIGENDIAN
593 #define wxMBConvUTF32straight wxMBConvUTF32BE
594 #define wxMBConvUTF32swap wxMBConvUTF32LE
596 #define wxMBConvUTF32swap wxMBConvUTF32BE
597 #define wxMBConvUTF32straight wxMBConvUTF32LE
601 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
602 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
607 // copy 32bit MB to 16bit String
608 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
612 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
616 size_t pa
=encode_utf16(*(wxUint32
*)psz
, cc
);
617 if (pa
== (size_t)-1)
627 psz
+= sizeof(wxUint32
);
629 if (buf
&& len
<n
) *buf
=0;
635 // copy 16bit String to 32bit MB
636 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
640 while (*psz
&& (!buf
|| len
< n
))
644 // cast is ok for WC_UTF16
645 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
646 if (pa
== (size_t)-1)
651 *(wxUint32
*)buf
= cc
;
652 buf
+= sizeof(wxUint32
);
654 len
+= sizeof(wxUint32
);
658 if (buf
&& len
<=n
-sizeof(wxUint32
))
666 // swap 32bit MB to 16bit String
667 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
671 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
674 tmp
[0] = psz
[3]; tmp
[1] = psz
[2];
675 tmp
[2] = psz
[1]; tmp
[3] = psz
[0];
680 size_t pa
=encode_utf16(*(wxUint32
*)tmp
, cc
);
681 if (pa
== (size_t)-1)
691 psz
+= sizeof(wxUint32
);
701 // swap 16bit String to 32bit MB
702 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
706 while (*psz
&& (!buf
|| len
< n
))
710 // cast is ok for WC_UTF16
711 size_t pa
=decode_utf16((const wxUint16
*)psz
, *(wxUint32
*)cc
);
712 if (pa
== (size_t)-1)
722 len
+= sizeof(wxUint32
);
726 if (buf
&& len
<=n
-sizeof(wxUint32
))
735 // copy 32bit MB to 32bit String
736 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
740 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
743 *buf
++ = *(wxUint32
*)psz
;
745 psz
+= sizeof(wxUint32
);
755 // copy 32bit String to 32bit MB
756 size_t wxMBConvUTF32straight::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
760 while (*psz
&& (!buf
|| len
< n
))
764 *(wxUint32
*)buf
= *psz
;
765 buf
+= sizeof(wxUint32
);
768 len
+= sizeof(wxUint32
);
772 if (buf
&& len
<=n
-sizeof(wxUint32
))
779 // swap 32bit MB to 32bit String
780 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
784 while (*(wxUint32
*)psz
&& (!buf
|| len
< n
))
788 ((char *)buf
)[0] = psz
[3];
789 ((char *)buf
)[1] = psz
[2];
790 ((char *)buf
)[2] = psz
[1];
791 ((char *)buf
)[3] = psz
[0];
795 psz
+= sizeof(wxUint32
);
805 // swap 32bit String to 32bit MB
806 size_t wxMBConvUTF32swap::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
810 while (*psz
&& (!buf
|| len
< n
))
814 *buf
++ = ((char *)psz
)[3];
815 *buf
++ = ((char *)psz
)[2];
816 *buf
++ = ((char *)psz
)[1];
817 *buf
++ = ((char *)psz
)[0];
819 len
+= sizeof(wxUint32
);
823 if (buf
&& len
<=n
-sizeof(wxUint32
))
833 // ============================================================================
834 // The classes doing conversion using the iconv_xxx() functions
835 // ============================================================================
839 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
840 // if the output buffer is _exactly_ as big as needed. Such a case is (unless there's
841 // yet another bug in glibc) the only case when iconv() returns with (size_t)-1
842 // (which means error) and says there are 0 bytes left in the input buffer --
843 // when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
844 // this alternative test for iconv() failure.
845 // [This bug does not appear in glibc 2.2.]
846 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
847 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
848 (errno != E2BIG || bufLeft != 0))
850 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
853 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
855 // ----------------------------------------------------------------------------
856 // wxMBConv_iconv: encapsulates an iconv character set
857 // ----------------------------------------------------------------------------
859 class wxMBConv_iconv
: public wxMBConv
862 wxMBConv_iconv(const wxChar
*name
);
863 virtual ~wxMBConv_iconv();
865 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
866 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
869 { return (m2w
!= (iconv_t
)-1) && (w2m
!= (iconv_t
)-1); }
872 // the iconv handlers used to translate from multibyte to wide char and in
873 // the other direction
878 // the name (for iconv_open()) of a wide char charset -- if none is
879 // available on this machine, it will remain NULL
880 static const char *ms_wcCharsetName
;
882 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
883 // different endian-ness than the native one
884 static bool ms_wcNeedsSwap
;
887 const char *wxMBConv_iconv::ms_wcCharsetName
= NULL
;
888 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
890 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
892 // Do it the hard way
894 for (size_t i
= 0; i
< wxStrlen(name
)+1; i
++)
895 cname
[i
] = (char) name
[i
];
897 // check for charset that represents wchar_t:
898 if (ms_wcCharsetName
== NULL
)
900 ms_wcNeedsSwap
= false;
902 // try charset with explicit bytesex info (e.g. "UCS-4LE"):
903 ms_wcCharsetName
= WC_NAME_BEST
;
904 m2w
= iconv_open(ms_wcCharsetName
, cname
);
906 if (m2w
== (iconv_t
)-1)
908 // try charset w/o bytesex info (e.g. "UCS4")
909 // and check for bytesex ourselves:
910 ms_wcCharsetName
= WC_NAME
;
911 m2w
= iconv_open(ms_wcCharsetName
, cname
);
913 // last bet, try if it knows WCHAR_T pseudo-charset
914 if (m2w
== (iconv_t
)-1)
916 ms_wcCharsetName
= "WCHAR_T";
917 m2w
= iconv_open(ms_wcCharsetName
, cname
);
920 if (m2w
!= (iconv_t
)-1)
922 char buf
[2], *bufPtr
;
923 wchar_t wbuf
[2], *wbufPtr
;
931 outsz
= SIZEOF_WCHAR_T
* 2;
935 res
= iconv(m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
936 (char**)&wbufPtr
, &outsz
);
938 if (ICONV_FAILED(res
, insz
))
940 ms_wcCharsetName
= NULL
;
941 wxLogLastError(wxT("iconv"));
942 wxLogError(_("Conversion to charset '%s' doesn't work."), name
);
946 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
951 ms_wcCharsetName
= NULL
;
953 // VS: we must not output an error here, since wxWindows will safely
954 // fall back to using wxEncodingConverter.
955 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name
);
959 wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName
, ms_wcNeedsSwap
);
961 else // we already have ms_wcCharsetName
963 m2w
= iconv_open(ms_wcCharsetName
, cname
);
966 // NB: don't ever pass NULL to iconv_open(), it may crash!
967 if ( ms_wcCharsetName
)
969 w2m
= iconv_open( cname
, ms_wcCharsetName
);
977 wxMBConv_iconv::~wxMBConv_iconv()
979 if ( m2w
!= (iconv_t
)-1 )
981 if ( w2m
!= (iconv_t
)-1 )
985 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
987 size_t inbuf
= strlen(psz
);
988 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
990 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
991 wchar_t *bufPtr
= buf
;
992 const char *pszPtr
= psz
;
996 // have destination buffer, convert there
998 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
999 (char**)&bufPtr
, &outbuf
);
1000 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1004 // convert to native endianness
1005 WC_BSWAP(buf
/* _not_ bufPtr */, res
)
1008 // NB: iconv was given only strlen(psz) characters on input, and so
1009 // it couldn't convert the trailing zero. Let's do it ourselves
1010 // if there's some room left for it in the output buffer.
1016 // no destination buffer... convert using temp buffer
1017 // to calculate destination buffer requirement
1022 outbuf
= 8*SIZEOF_WCHAR_T
;
1025 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1026 (char**)&bufPtr
, &outbuf
);
1028 res
+= 8-(outbuf
/SIZEOF_WCHAR_T
);
1029 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1032 if (ICONV_FAILED(cres
, inbuf
))
1034 //VS: it is ok if iconv fails, hence trace only
1035 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1042 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1044 size_t inbuf
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1048 wchar_t *tmpbuf
= 0;
1052 // need to copy to temp buffer to switch endianness
1053 // this absolutely doesn't rock!
1054 // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1055 // could be in read-only memory, or be accessed in some other thread)
1056 tmpbuf
=(wchar_t*)malloc((inbuf
+1)*SIZEOF_WCHAR_T
);
1057 memcpy(tmpbuf
,psz
,(inbuf
+1)*SIZEOF_WCHAR_T
);
1058 WC_BSWAP(tmpbuf
, inbuf
)
1064 // have destination buffer, convert there
1065 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1069 // NB: iconv was given only wcslen(psz) characters on input, and so
1070 // it couldn't convert the trailing zero. Let's do it ourselves
1071 // if there's some room left for it in the output buffer.
1077 // no destination buffer... convert using temp buffer
1078 // to calculate destination buffer requirement
1082 buf
= tbuf
; outbuf
= 16;
1084 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1087 } while ((cres
==(size_t)-1) && (errno
==E2BIG
));
1095 if (ICONV_FAILED(cres
, inbuf
))
1097 //VS: it is ok if iconv fails, hence trace only
1098 wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1105 #endif // HAVE_ICONV
1108 // ============================================================================
1109 // Win32 conversion classes
1110 // ============================================================================
1112 #ifdef wxHAVE_WIN32_MB2WC
1115 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1116 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1118 class wxMBConv_win32
: public wxMBConv
1123 m_CodePage
= CP_ACP
;
1126 wxMBConv_win32(const wxChar
* name
)
1128 m_CodePage
= wxCharsetToCodepage(name
);
1131 wxMBConv_win32(wxFontEncoding encoding
)
1133 m_CodePage
= wxEncodingToCodepage(encoding
);
1136 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1138 const size_t len
= ::MultiByteToWideChar
1140 m_CodePage
, // code page
1142 psz
, // input string
1143 -1, // its length (NUL-terminated)
1144 buf
, // output string
1145 buf
? n
: 0 // size of output buffer
1148 // note that it returns count of written chars for buf != NULL and size
1149 // of the needed buffer for buf == NULL so in either case the length of
1150 // the string (which never includes the terminating NUL) is one less
1151 return len
? len
- 1 : (size_t)-1;
1154 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1156 const size_t len
= ::WideCharToMultiByte
1158 m_CodePage
, // code page
1160 psz
, // input string
1161 -1, // it is (wide) NUL-terminated
1162 buf
, // output buffer
1163 buf
? n
: 0, // and its size
1164 NULL
, // default "replacement" char
1165 NULL
// [out] was it used?
1168 // see the comment above for the reason of "len - 1"
1169 return len
? len
- 1 : (size_t)-1;
1173 { return m_CodePage
!= -1; }
1179 #endif // wxHAVE_WIN32_MB2WC
1181 // ============================================================================
1182 // Mac conversion classes
1183 // ============================================================================
1185 #if defined(__WXMAC__) && defined(TARGET_CARBON)
1187 class wxMBConv_mac
: public wxMBConv
1192 Init(CFStringGetSystemEncoding()) ;
1195 wxMBConv_mac(const wxChar
* name
)
1197 Init( EncodingToSystem(wxFontMapper::Get()->CharsetToEncoding(name
, FALSE
) ) ) ;
1200 wxMBConv_mac(wxFontEncoding encoding
)
1202 Init( EncodingToSystem(encoding
) );
1207 OSStatus status
= noErr
;
1208 status
= TECDisposeConverter(m_MB2WC_converter
);
1209 status
= TECDisposeConverter(m_WC2MB_converter
);
1212 static TextEncodingBase
EncodingToSystem(wxFontEncoding encoding
)
1214 TextEncodingBase enc
= CFStringGetSystemEncoding() ;
1218 case wxFONTENCODING_ISO8859_1
:
1219 enc
= kTextEncodingISOLatin1
;
1221 case wxFONTENCODING_ISO8859_2
:
1222 enc
= kTextEncodingISOLatin2
;
1224 case wxFONTENCODING_ISO8859_3
:
1225 enc
= kTextEncodingISOLatin3
;
1227 case wxFONTENCODING_ISO8859_4
:
1228 enc
= kTextEncodingISOLatin4
;
1230 case wxFONTENCODING_ISO8859_5
:
1231 enc
= kTextEncodingISOLatinCyrillic
;
1233 case wxFONTENCODING_ISO8859_6
:
1234 enc
= kTextEncodingISOLatinArabic
;
1236 case wxFONTENCODING_ISO8859_7
:
1237 enc
= kTextEncodingISOLatinGreek
;
1239 case wxFONTENCODING_ISO8859_8
:
1240 enc
= kTextEncodingISOLatinHebrew
;
1242 case wxFONTENCODING_ISO8859_9
:
1243 enc
= kTextEncodingISOLatin5
;
1245 case wxFONTENCODING_ISO8859_10
:
1246 enc
= kTextEncodingISOLatin6
;
1248 case wxFONTENCODING_ISO8859_13
:
1249 enc
= kTextEncodingISOLatin7
;
1251 case wxFONTENCODING_ISO8859_14
:
1252 enc
= kTextEncodingISOLatin8
;
1254 case wxFONTENCODING_ISO8859_15
:
1255 enc
= kTextEncodingISOLatin9
;
1258 case wxFONTENCODING_KOI8
:
1259 enc
= kTextEncodingKOI8_R
;
1261 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
1262 enc
= kTextEncodingDOSRussian
;
1265 case wxFONTENCODING_BULGARIAN :
1269 case wxFONTENCODING_CP437
:
1270 enc
=kTextEncodingDOSLatinUS
;
1272 case wxFONTENCODING_CP850
:
1273 enc
= kTextEncodingDOSLatin1
;
1275 case wxFONTENCODING_CP852
:
1276 enc
= kTextEncodingDOSLatin2
;
1278 case wxFONTENCODING_CP855
:
1279 enc
= kTextEncodingDOSCyrillic
;
1281 case wxFONTENCODING_CP866
:
1282 enc
=kTextEncodingDOSRussian
;
1284 case wxFONTENCODING_CP874
:
1285 enc
= kTextEncodingDOSThai
;
1287 case wxFONTENCODING_CP932
:
1288 enc
= kTextEncodingDOSJapanese
;
1290 case wxFONTENCODING_CP936
:
1291 enc
=kTextEncodingDOSChineseSimplif
;
1293 case wxFONTENCODING_CP949
:
1294 enc
= kTextEncodingDOSKorean
;
1296 case wxFONTENCODING_CP950
:
1297 enc
= kTextEncodingDOSChineseTrad
;
1300 case wxFONTENCODING_CP1250
:
1301 enc
= kTextEncodingWindowsLatin2
;
1303 case wxFONTENCODING_CP1251
:
1304 enc
=kTextEncodingWindowsCyrillic
;
1306 case wxFONTENCODING_CP1252
:
1307 enc
=kTextEncodingWindowsLatin1
;
1309 case wxFONTENCODING_CP1253
:
1310 enc
= kTextEncodingWindowsGreek
;
1312 case wxFONTENCODING_CP1254
:
1313 enc
= kTextEncodingWindowsLatin5
;
1315 case wxFONTENCODING_CP1255
:
1316 enc
=kTextEncodingWindowsHebrew
;
1318 case wxFONTENCODING_CP1256
:
1319 enc
=kTextEncodingWindowsArabic
;
1321 case wxFONTENCODING_CP1257
:
1322 enc
= kTextEncodingWindowsBalticRim
;
1325 case wxFONTENCODING_UTF7
:
1326 enc
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicodeUTF7Format
) ;
1328 case wxFONTENCODING_UTF8
:
1329 enc
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicodeUTF8Format
) ;
1331 case wxFONTENCODING_EUC_JP
:
1332 enc
= kTextEncodingEUC_JP
;
1334 case wxFONTENCODING_UTF16BE
:
1335 enc
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
1337 case wxFONTENCODING_UTF16LE
:
1338 enc
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
1340 case wxFONTENCODING_UTF32BE
:
1341 enc
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode32BitFormat
) ;
1343 case wxFONTENCODING_UTF32LE
:
1344 enc
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode32BitFormat
) ;
1350 void Init( TextEncodingBase encoding
)
1352 OSStatus status
= noErr
;
1353 m_char_encoding
= encoding
;
1354 #if SIZEOF_WCHAR_T == 4
1355 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode32BitFormat
) ;
1357 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
,0,kUnicode16BitFormat
) ;
1359 status
= TECCreateConverter(&m_MB2WC_converter
,
1361 m_unicode_encoding
);
1362 status
= TECCreateConverter(&m_WC2MB_converter
,
1367 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1369 OSStatus status
= noErr
;
1370 ByteCount byteOutLen
;
1371 ByteCount byteInLen
= strlen(psz
) ;
1372 ByteCount byteBufferLen
= n
;
1373 wchar_t *tbuf
= NULL
;
1377 n
= byteInLen
* SIZEOF_WCHAR_T
;
1378 tbuf
= (wchar_t*) malloc( n
) ;
1381 status
= TECConvertText(m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
1382 (TextPtr
) (buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
1387 size_t res
= byteOutLen
/ SIZEOF_WCHAR_T
;
1388 if ( buf
&& res
< n
)
1394 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1396 OSStatus status
= noErr
;
1397 ByteCount byteOutLen
;
1398 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
1399 ByteCount byteBufferLen
= n
;
1406 tbuf
= (char*) malloc( n
) ;
1409 status
= TECConvertText(m_WC2MB_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
1410 (TextPtr
) ( buf
? buf
: tbuf
) , byteBufferLen
, &byteOutLen
);
1415 size_t res
= byteOutLen
;
1416 if ( buf
&& res
< n
)
1423 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
1426 TECObjectRef m_MB2WC_converter
;
1427 TECObjectRef m_WC2MB_converter
;
1429 TextEncodingBase m_char_encoding
;
1430 TextEncodingBase m_unicode_encoding
;
1433 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
1435 // ============================================================================
1436 // wxEncodingConverter based conversion classes
1437 // ============================================================================
1441 class wxMBConv_wxwin
: public wxMBConv
1446 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
1447 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
1451 // temporarily just use wxEncodingConverter stuff,
1452 // so that it works while a better implementation is built
1453 wxMBConv_wxwin(const wxChar
* name
)
1456 m_enc
= wxFontMapper::Get()->CharsetToEncoding(name
, false);
1458 m_enc
= wxFONTENCODING_SYSTEM
;
1463 wxMBConv_wxwin(wxFontEncoding enc
)
1470 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
1472 size_t inbuf
= strlen(psz
);
1474 m2w
.Convert(psz
,buf
);
1478 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
1480 const size_t inbuf
= wxWcslen(psz
);
1482 w2m
.Convert(psz
,buf
);
1487 bool IsOk() const { return m_ok
; }
1490 wxFontEncoding m_enc
;
1491 wxEncodingConverter m2w
, w2m
;
1493 // were we initialized successfully?
1496 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
1499 #endif // wxUSE_FONTMAP
1501 // ============================================================================
1502 // wxCSConv implementation
1503 // ============================================================================
1505 void wxCSConv::Init()
1512 wxCSConv::wxCSConv(const wxChar
*charset
)
1521 m_encoding
= wxFONTENCODING_SYSTEM
;
1524 wxCSConv::wxCSConv(wxFontEncoding encoding
)
1526 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
1528 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
1530 encoding
= wxFONTENCODING_SYSTEM
;
1535 m_encoding
= encoding
;
1538 wxCSConv::~wxCSConv()
1543 wxCSConv::wxCSConv(const wxCSConv
& conv
)
1548 SetName(conv
.m_name
);
1549 m_encoding
= conv
.m_encoding
;
1552 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
1556 SetName(conv
.m_name
);
1557 m_encoding
= conv
.m_encoding
;
1562 void wxCSConv::Clear()
1571 void wxCSConv::SetName(const wxChar
*charset
)
1575 m_name
= wxStrdup(charset
);
1580 wxMBConv
*wxCSConv::DoCreate() const
1582 // check for the special case of ASCII or ISO8859-1 charset: as we have
1583 // special knowledge of it anyhow, we don't need to create a special
1584 // conversion object
1585 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
1587 // don't convert at all
1591 // we trust OS to do conversion better than we can so try external
1592 // conversion methods first
1594 // the full order is:
1595 // 1. OS conversion (iconv() under Unix or Win32 API)
1596 // 2. hard coded conversions for UTF
1597 // 3. wxEncodingConverter as fall back
1603 #endif // !wxUSE_FONTMAP
1605 wxString
name(m_name
);
1609 name
= wxFontMapper::Get()->GetEncodingName(m_encoding
);
1610 #endif // wxUSE_FONTMAP
1612 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
1618 #endif // HAVE_ICONV
1620 #ifdef wxHAVE_WIN32_MB2WC
1622 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
1623 : new wxMBConv_win32(m_encoding
);
1629 #endif // wxHAVE_WIN32_MB2WC
1630 #if defined(__WXMAC__)
1632 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
) )
1635 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
1636 : new wxMBConv_mac(m_encoding
);
1645 wxFontEncoding enc
= m_encoding
;
1647 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
1649 // use "false" to suppress interactive dialogs -- we can be called from
1650 // anywhere and popping up a dialog from here is the last thing we want to do
1652 enc
= wxFontMapper::Get()->CharsetToEncoding(m_name
, false);
1654 #endif // wxUSE_FONTMAP
1658 case wxFONTENCODING_UTF7
:
1659 return new wxMBConvUTF7
;
1661 case wxFONTENCODING_UTF8
:
1662 return new wxMBConvUTF8
;
1664 case wxFONTENCODING_UTF16BE
:
1665 return new wxMBConvUTF16BE
;
1667 case wxFONTENCODING_UTF16LE
:
1668 return new wxMBConvUTF16LE
;
1670 case wxFONTENCODING_UTF32BE
:
1671 return new wxMBConvUTF32BE
;
1673 case wxFONTENCODING_UTF32LE
:
1674 return new wxMBConvUTF32LE
;
1677 // nothing to do but put here to suppress gcc warnings
1684 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
1685 : new wxMBConv_wxwin(m_encoding
);
1691 #endif // wxUSE_FONTMAP
1693 // NB: This is a hack to prevent deadlock. What could otherwise happen
1694 // in Unicode build: wxConvLocal creation ends up being here
1695 // because of some failure and logs the error. But wxLog will try to
1696 // attach timestamp, for which it will need wxConvLocal (to convert
1697 // time to char* and then wchar_t*), but that fails, tries to log
1698 // error, but wxLog has an (already locked) critical section that
1699 // guards static buffer.
1700 static bool alreadyLoggingError
= false;
1701 if (!alreadyLoggingError
)
1703 alreadyLoggingError
= true;
1704 wxLogError(_("Cannot convert from the charset '%s'!"),
1708 wxFontMapper::GetEncodingDescription(m_encoding
).c_str()
1709 #else // !wxUSE_FONTMAP
1710 wxString::Format(_("encoding %s"), m_encoding
).c_str()
1711 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1713 alreadyLoggingError
= false;
1719 void wxCSConv::CreateConvIfNeeded() const
1723 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
1726 // if we have neither the name nor the encoding, use the default
1727 // encoding for this system
1728 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
1730 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
1732 #endif // wxUSE_INTL
1734 self
->m_convReal
= DoCreate();
1735 self
->m_deferred
= false;
1739 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1741 CreateConvIfNeeded();
1744 return m_convReal
->MB2WC(buf
, psz
, n
);
1747 size_t len
= strlen(psz
);
1751 for (size_t c
= 0; c
<= len
; c
++)
1752 buf
[c
] = (unsigned char)(psz
[c
]);
1758 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1760 CreateConvIfNeeded();
1763 return m_convReal
->WC2MB(buf
, psz
, n
);
1766 const size_t len
= wxWcslen(psz
);
1769 for (size_t c
= 0; c
<= len
; c
++)
1778 for (size_t c
= 0; c
<= len
; c
++)
1788 // ----------------------------------------------------------------------------
1790 // ----------------------------------------------------------------------------
1793 static wxMBConv_win32 wxConvLibcObj
;
1795 static wxMBConvLibc wxConvLibcObj
;
1798 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
1799 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
1800 static wxMBConvUTF7 wxConvUTF7Obj
;
1801 static wxMBConvUTF8 wxConvUTF8Obj
;
1804 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
1805 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
1806 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
1807 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
1808 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
1809 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
1811 #else // !wxUSE_WCHAR_T
1813 // stand-ins in absence of wchar_t
1814 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
1819 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T