1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
23 #include "wx/strconv.h"
28 #include "wx/msw/private.h"
29 #include "wx/msw/missing.h"
40 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
41 #define wxHAVE_WIN32_MB2WC
50 #include "wx/thread.h"
53 #include "wx/encconv.h"
54 #include "wx/fontmap.h"
59 #include <ATSUnicode.h>
60 #include <TextCommon.h>
61 #include <TextEncodingConverter.h>
64 // includes Mac headers
65 #include "wx/mac/private.h"
69 #define TRACE_STRCONV _T("strconv")
71 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
73 #if SIZEOF_WCHAR_T == 2
78 // ============================================================================
80 // ============================================================================
82 // helper function used by ToWChar() and cWC2MB(): returns true if the n bytes at this location are not all NUL
83 static bool NotAllNULs(const char *p
, size_t n
)
85 while ( n
&& *p
++ == '\0' )
91 // ----------------------------------------------------------------------------
92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
93 // ----------------------------------------------------------------------------
95 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
100 *output
= (wxUint16
) input
;
104 else if (input
>= 0x110000)
106 return wxCONV_FAILED
;
112 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
113 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
120 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
122 if ((*input
< 0xd800) || (*input
> 0xdfff))
127 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
130 return wxCONV_FAILED
;
134 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
140 typedef wchar_t wxDecodeSurrogate_t
;
142 typedef wxUint16 wxDecodeSurrogate_t
;
143 #endif // WC_UTF16/!WC_UTF16
145 // returns the next UTF-32 character from the wchar_t buffer and advances the
146 // pointer to the character after this one
148 // if an invalid character is found, *pSrc is set to NULL, the caller must
150 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
153 const size_t n
= decode_utf16(wx_reinterpret_cast(wxUint16
*, *pSrc
), out
);
154 if ( n
== wxCONV_FAILED
)
162 // ----------------------------------------------------------------------------
164 // ----------------------------------------------------------------------------
167 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
168 const char *src
, size_t srcLen
) const
170 // although new conversion classes are supposed to implement this function
171 // directly, the existing ones only implement the old MB2WC() and so, to
172 // avoid having to rewrite all conversion classes at once, we provide a
173 // default (but not efficient) implementation of this one in terms of the
174 // old function by copying the input to ensure that it's NUL-terminated and
175 // then using MB2WC() to convert it
177 // the number of chars [which would be] written to dst [if it were not NULL]
178 size_t dstWritten
= 0;
180 // the number of NULs terminating this string
181 size_t nulLen
wxDUMMY_INITIALIZE(0);
183 // if we were not given the input size we just have to assume that the
184 // string is properly terminated as we have no way of knowing how long it
185 // is anyhow, but if we do have the size check whether there are enough
189 if ( srcLen
!= wxNO_LEN
)
191 // we need to know how to find the end of this string
192 nulLen
= GetMBNulLen();
193 if ( nulLen
== wxCONV_FAILED
)
194 return wxCONV_FAILED
;
196 // if there are enough NULs we can avoid the copy
197 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
199 // make a copy in order to properly NUL-terminate the string
200 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
201 char * const p
= bufTmp
.data();
202 memcpy(p
, src
, srcLen
);
203 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
209 srcEnd
= src
+ srcLen
;
211 else // quit after the first loop iteration
218 // try to convert the current chunk
219 size_t lenChunk
= MB2WC(NULL
, src
, 0);
220 if ( lenChunk
== wxCONV_FAILED
)
221 return wxCONV_FAILED
;
223 lenChunk
++; // for the L'\0' at the end of this chunk
225 dstWritten
+= lenChunk
;
229 // nothing left in the input string, conversion succeeded
235 if ( dstWritten
> dstLen
)
236 return wxCONV_FAILED
;
238 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
239 return wxCONV_FAILED
;
246 // we convert just one chunk in this case as this is the entire
251 // advance the input pointer past the end of this chunk
252 while ( NotAllNULs(src
, nulLen
) )
254 // notice that we must skip over multiple bytes here as we suppose
255 // that if NUL takes 2 or 4 bytes, then all the other characters do
256 // too and so if advanced by a single byte we might erroneously
257 // detect sequences of NUL bytes in the middle of the input
261 src
+= nulLen
; // skipping over its terminator as well
263 // note that ">=" (and not just "==") is needed here as the terminator
264 // we skipped just above could be inside or just after the buffer
265 // delimited by srcEnd
274 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
275 const wchar_t *src
, size_t srcLen
) const
277 // the number of chars [which would be] written to dst [if it were not NULL]
278 size_t dstWritten
= 0;
280 // make a copy of the input string unless it is already properly
283 // if we don't know its length we have no choice but to assume that it is,
284 // indeed, properly terminated
285 wxWCharBuffer bufTmp
;
286 if ( srcLen
== wxNO_LEN
)
288 srcLen
= wxWcslen(src
) + 1;
290 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
292 // make a copy in order to properly NUL-terminate the string
293 bufTmp
= wxWCharBuffer(srcLen
);
294 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
298 const size_t lenNul
= GetMBNulLen();
299 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
301 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
303 // try to convert the current chunk
304 size_t lenChunk
= WC2MB(NULL
, src
, 0);
306 if ( lenChunk
== wxCONV_FAILED
)
307 return wxCONV_FAILED
;
310 dstWritten
+= lenChunk
;
314 if ( dstWritten
> dstLen
)
315 return wxCONV_FAILED
;
317 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
318 return wxCONV_FAILED
;
327 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
329 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
330 if ( rc
!= wxCONV_FAILED
)
332 // ToWChar() returns the buffer length, i.e. including the trailing
333 // NUL, while this method doesn't take it into account
340 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
342 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
343 if ( rc
!= wxCONV_FAILED
)
351 wxMBConv::~wxMBConv()
353 // nothing to do here (necessary for Darwin linking probably)
356 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
360 // calculate the length of the buffer needed first
361 const size_t nLen
= MB2WC(NULL
, psz
, 0);
362 if ( nLen
!= wxCONV_FAILED
)
364 // now do the actual conversion
365 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
367 // +1 for the trailing NUL
368 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
373 return wxWCharBuffer();
376 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
380 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
381 if ( nLen
!= wxCONV_FAILED
)
383 // extra space for trailing NUL(s)
384 static const size_t extraLen
= GetMaxMBNulLen();
386 wxCharBuffer
buf(nLen
+ extraLen
- 1);
387 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
392 return wxCharBuffer();
396 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
398 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
399 if ( dstLen
!= wxCONV_FAILED
)
401 wxWCharBuffer
wbuf(dstLen
- 1);
402 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
407 if ( wbuf
[dstLen
- 1] == L
'\0' )
418 return wxWCharBuffer();
422 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
424 const size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
425 if ( dstLen
!= wxCONV_FAILED
)
427 wxCharBuffer
buf(dstLen
- 1);
428 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
434 const size_t nulLen
= GetMBNulLen();
435 if ( !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
437 // in this case the output is NUL-terminated and we're not
438 // supposed to count NUL
450 return wxCharBuffer();
453 // ----------------------------------------------------------------------------
455 // ----------------------------------------------------------------------------
457 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
459 return wxMB2WC(buf
, psz
, n
);
462 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
464 return wxWC2MB(buf
, psz
, n
);
467 // ----------------------------------------------------------------------------
468 // wxConvBrokenFileNames
469 // ----------------------------------------------------------------------------
473 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
475 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
476 || wxStricmp(charset
, _T("UTF8")) == 0 )
477 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
479 m_conv
= new wxCSConv(charset
);
484 // ----------------------------------------------------------------------------
486 // ----------------------------------------------------------------------------
488 // Implementation (C) 2004 Fredrik Roubert
491 // BASE64 decoding table
493 static const unsigned char utf7unb64
[] =
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
500 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
501 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
502 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
504 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
505 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
506 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
508 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
509 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
510 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
529 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
533 while ( *psz
&& (!buf
|| (len
< n
)) )
535 unsigned char cc
= *psz
++;
543 else if (*psz
== '-')
551 else // start of BASE64 encoded string
555 for ( ok
= lsb
= false, d
= 0, l
= 0;
556 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
561 for (l
+= 6; l
>= 8; lsb
= !lsb
)
563 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
573 *buf
= (wchar_t)(c
<< 8);
582 // in valid UTF7 we should have valid characters after '+'
583 return wxCONV_FAILED
;
591 if ( buf
&& (len
< n
) )
598 // BASE64 encoding table
600 static const unsigned char utf7enb64
[] =
602 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
603 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
604 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
605 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
606 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
607 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
608 'w', 'x', 'y', 'z', '0', '1', '2', '3',
609 '4', '5', '6', '7', '8', '9', '+', '/'
613 // UTF-7 encoding table
615 // 0 - Set D (directly encoded characters)
616 // 1 - Set O (optional direct characters)
617 // 2 - whitespace characters (optional)
618 // 3 - special characters
620 static const unsigned char utf7encode
[128] =
622 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
623 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
624 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
625 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
626 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
627 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
628 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
629 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
632 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
636 while (*psz
&& ((!buf
) || (len
< n
)))
639 if (cc
< 0x80 && utf7encode
[cc
] < 1)
648 else if (((wxUint32
)cc
) > 0xffff)
650 // no surrogate pair generation (yet?)
651 return wxCONV_FAILED
;
662 // BASE64 encode string
663 unsigned int lsb
, d
, l
;
664 for (d
= 0, l
= 0; /*nothing*/; psz
++)
666 for (lsb
= 0; lsb
< 2; lsb
++)
669 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
671 for (l
+= 8; l
>= 6; )
675 *buf
++ = utf7enb64
[(d
>> l
) % 64];
681 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
688 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
700 if (buf
&& (len
< n
))
706 // ----------------------------------------------------------------------------
708 // ----------------------------------------------------------------------------
710 static wxUint32 utf8_max
[]=
711 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
713 // boundaries of the private use area we use to (temporarily) remap
714 // characters which are invalid in a UTF-8 encoded string
715 const wxUint32 wxUnicodePUA
= 0x100000;
716 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
718 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
722 while (*psz
&& ((!buf
) || (len
< n
)))
724 const char *opsz
= psz
;
725 bool invalid
= false;
726 unsigned char cc
= *psz
++, fc
= cc
;
728 for (cnt
= 0; fc
& 0x80; cnt
++)
738 // escape the escape character for octal escapes
739 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
740 && cc
== '\\' && (!buf
|| len
< n
))
752 // invalid UTF-8 sequence
757 unsigned ocnt
= cnt
- 1;
758 wxUint32 res
= cc
& (0x3f >> cnt
);
762 if ((cc
& 0xC0) != 0x80)
764 // invalid UTF-8 sequence
770 res
= (res
<< 6) | (cc
& 0x3f);
773 if (invalid
|| res
<= utf8_max
[ocnt
])
775 // illegal UTF-8 encoding
778 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
779 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
781 // if one of our PUA characters turns up externally
782 // it must also be treated as an illegal sequence
783 // (a bit like you have to escape an escape character)
789 // cast is ok because wchar_t == wxUint16 if WC_UTF16
790 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
791 if (pa
== wxCONV_FAILED
)
803 *buf
++ = (wchar_t)res
;
805 #endif // WC_UTF16/!WC_UTF16
811 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
813 while (opsz
< psz
&& (!buf
|| len
< n
))
816 // cast is ok because wchar_t == wxUint16 if WC_UTF16
817 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
818 wxASSERT(pa
!= wxCONV_FAILED
);
825 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
831 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
833 while (opsz
< psz
&& (!buf
|| len
< n
))
835 if ( buf
&& len
+ 3 < n
)
837 unsigned char on
= *opsz
;
839 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
840 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
841 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
848 else // MAP_INVALID_UTF8_NOT
850 return wxCONV_FAILED
;
856 if (buf
&& (len
< n
))
862 static inline bool isoctal(wchar_t wch
)
864 return L
'0' <= wch
&& wch
<= L
'7';
867 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
871 while (*psz
&& ((!buf
) || (len
< n
)))
876 // cast is ok for WC_UTF16
877 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
878 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
880 cc
= (*psz
++) & 0x7fffffff;
883 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
884 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
887 *buf
++ = (char)(cc
- wxUnicodePUA
);
890 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
891 && cc
== L
'\\' && psz
[0] == L
'\\' )
898 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
900 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
904 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
905 (psz
[1] - L
'0') * 010 +
915 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
931 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
933 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
939 if (buf
&& (len
< n
))
945 // ============================================================================
947 // ============================================================================
949 #ifdef WORDS_BIGENDIAN
950 #define wxMBConvUTF16straight wxMBConvUTF16BE
951 #define wxMBConvUTF16swap wxMBConvUTF16LE
953 #define wxMBConvUTF16swap wxMBConvUTF16BE
954 #define wxMBConvUTF16straight wxMBConvUTF16LE
958 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
960 if ( srcLen
== wxNO_LEN
)
962 // count the number of bytes in input, including the trailing NULs
963 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
964 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
967 srcLen
*= BYTES_PER_CHAR
;
969 else // we already have the length
971 // we can only convert an entire number of UTF-16 characters
972 if ( srcLen
% BYTES_PER_CHAR
)
973 return wxCONV_FAILED
;
979 // case when in-memory representation is UTF-16 too
982 // ----------------------------------------------------------------------------
983 // conversions without endianness change
984 // ----------------------------------------------------------------------------
987 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
988 const char *src
, size_t srcLen
) const
990 // set up the scene for using memcpy() (which is presumably more efficient
991 // than copying the bytes one by one)
992 srcLen
= GetLength(src
, srcLen
);
993 if ( srcLen
== wxNO_LEN
)
994 return wxCONV_FAILED
;
996 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
999 if ( dstLen
< inLen
)
1000 return wxCONV_FAILED
;
1002 memcpy(dst
, src
, srcLen
);
1009 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1010 const wchar_t *src
, size_t srcLen
) const
1012 if ( srcLen
== wxNO_LEN
)
1013 srcLen
= wxWcslen(src
) + 1;
1015 srcLen
*= BYTES_PER_CHAR
;
1019 if ( dstLen
< srcLen
)
1020 return wxCONV_FAILED
;
1022 memcpy(dst
, src
, srcLen
);
1028 // ----------------------------------------------------------------------------
1029 // endian-reversing conversions
1030 // ----------------------------------------------------------------------------
1033 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1034 const char *src
, size_t srcLen
) const
1036 srcLen
= GetLength(src
, srcLen
);
1037 if ( srcLen
== wxNO_LEN
)
1038 return wxCONV_FAILED
;
1040 srcLen
/= BYTES_PER_CHAR
;
1044 if ( dstLen
< srcLen
)
1045 return wxCONV_FAILED
;
1047 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1048 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1050 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1058 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1059 const wchar_t *src
, size_t srcLen
) const
1061 if ( srcLen
== wxNO_LEN
)
1062 srcLen
= wxWcslen(src
) + 1;
1064 srcLen
*= BYTES_PER_CHAR
;
1068 if ( dstLen
< srcLen
)
1069 return wxCONV_FAILED
;
1071 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1072 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1074 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1081 #else // !WC_UTF16: wchar_t is UTF-32
1083 // ----------------------------------------------------------------------------
1084 // conversions without endianness change
1085 // ----------------------------------------------------------------------------
1088 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1089 const char *src
, size_t srcLen
) const
1091 srcLen
= GetLength(src
, srcLen
);
1092 if ( srcLen
== wxNO_LEN
)
1093 return wxCONV_FAILED
;
1095 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1098 // optimization: return maximal space which could be needed for this
1099 // string even if the real size could be smaller if the buffer contains
1105 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1106 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1108 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1110 return wxCONV_FAILED
;
1112 if ( ++outLen
> dstLen
)
1113 return wxCONV_FAILED
;
1123 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1124 const wchar_t *src
, size_t srcLen
) const
1126 if ( srcLen
== wxNO_LEN
)
1127 srcLen
= wxWcslen(src
) + 1;
1130 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1131 for ( size_t n
= 0; n
< srcLen
; n
++ )
1134 const size_t numChars
= encode_utf16(*src
++, cc
);
1135 if ( numChars
== wxCONV_FAILED
)
1136 return wxCONV_FAILED
;
1138 outLen
+= numChars
* BYTES_PER_CHAR
;
1141 if ( outLen
> dstLen
)
1142 return wxCONV_FAILED
;
1145 if ( numChars
== 2 )
1147 // second character of a surrogate
1156 // ----------------------------------------------------------------------------
1157 // endian-reversing conversions
1158 // ----------------------------------------------------------------------------
1161 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1162 const char *src
, size_t srcLen
) const
1164 srcLen
= GetLength(src
, srcLen
);
1165 if ( srcLen
== wxNO_LEN
)
1166 return wxCONV_FAILED
;
1168 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1171 // optimization: return maximal space which could be needed for this
1172 // string even if the real size could be smaller if the buffer contains
1178 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1179 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1184 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1186 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1188 const size_t numChars
= decode_utf16(tmp
, ch
);
1189 if ( numChars
== wxCONV_FAILED
)
1190 return wxCONV_FAILED
;
1192 if ( numChars
== 2 )
1195 if ( ++outLen
> dstLen
)
1196 return wxCONV_FAILED
;
1206 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1207 const wchar_t *src
, size_t srcLen
) const
1209 if ( srcLen
== wxNO_LEN
)
1210 srcLen
= wxWcslen(src
) + 1;
1213 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1214 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1217 const size_t numChars
= encode_utf16(*src
, cc
);
1218 if ( numChars
== wxCONV_FAILED
)
1219 return wxCONV_FAILED
;
1221 outLen
+= numChars
* BYTES_PER_CHAR
;
1224 if ( outLen
> dstLen
)
1225 return wxCONV_FAILED
;
1227 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1228 if ( numChars
== 2 )
1230 // second character of a surrogate
1231 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1239 #endif // WC_UTF16/!WC_UTF16
1242 // ============================================================================
1244 // ============================================================================
1246 #ifdef WORDS_BIGENDIAN
1247 #define wxMBConvUTF32straight wxMBConvUTF32BE
1248 #define wxMBConvUTF32swap wxMBConvUTF32LE
1250 #define wxMBConvUTF32swap wxMBConvUTF32BE
1251 #define wxMBConvUTF32straight wxMBConvUTF32LE
1255 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1256 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1259 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1261 if ( srcLen
== wxNO_LEN
)
1263 // count the number of bytes in input, including the trailing NULs
1264 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1265 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1268 srcLen
*= BYTES_PER_CHAR
;
1270 else // we already have the length
1272 // we can only convert an entire number of UTF-32 characters
1273 if ( srcLen
% BYTES_PER_CHAR
)
1274 return wxCONV_FAILED
;
1280 // case when in-memory representation is UTF-16
1283 // ----------------------------------------------------------------------------
1284 // conversions without endianness change
1285 // ----------------------------------------------------------------------------
1288 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1289 const char *src
, size_t srcLen
) const
1291 srcLen
= GetLength(src
, srcLen
);
1292 if ( srcLen
== wxNO_LEN
)
1293 return wxCONV_FAILED
;
1295 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1296 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1298 for ( size_t n
= 0; n
< inLen
; n
++ )
1301 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1302 if ( numChars
== wxCONV_FAILED
)
1303 return wxCONV_FAILED
;
1308 if ( outLen
> dstLen
)
1309 return wxCONV_FAILED
;
1312 if ( numChars
== 2 )
1314 // second character of a surrogate
1324 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1325 const wchar_t *src
, size_t srcLen
) const
1327 if ( srcLen
== wxNO_LEN
)
1328 srcLen
= wxWcslen(src
) + 1;
1332 // optimization: return maximal space which could be needed for this
1333 // string instead of the exact amount which could be less if there are
1334 // any surrogates in the input
1336 // we consider that surrogates are rare enough to make it worthwhile to
1337 // avoid running the loop below at the cost of slightly extra memory
1339 return srcLen
* BYTES_PER_CHAR
;
1342 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1344 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1346 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1348 return wxCONV_FAILED
;
1350 outLen
+= BYTES_PER_CHAR
;
1352 if ( outLen
> dstLen
)
1353 return wxCONV_FAILED
;
1361 // ----------------------------------------------------------------------------
1362 // endian-reversing conversions
1363 // ----------------------------------------------------------------------------
1366 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1367 const char *src
, size_t srcLen
) const
1369 srcLen
= GetLength(src
, srcLen
);
1370 if ( srcLen
== wxNO_LEN
)
1371 return wxCONV_FAILED
;
1373 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1374 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1376 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1379 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1380 if ( numChars
== wxCONV_FAILED
)
1381 return wxCONV_FAILED
;
1386 if ( outLen
> dstLen
)
1387 return wxCONV_FAILED
;
1390 if ( numChars
== 2 )
1392 // second character of a surrogate
1402 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1403 const wchar_t *src
, size_t srcLen
) const
1405 if ( srcLen
== wxNO_LEN
)
1406 srcLen
= wxWcslen(src
) + 1;
1410 // optimization: return maximal space which could be needed for this
1411 // string instead of the exact amount which could be less if there are
1412 // any surrogates in the input
1414 // we consider that surrogates are rare enough to make it worthwhile to
1415 // avoid running the loop below at the cost of slightly extra memory
1417 return srcLen
*BYTES_PER_CHAR
;
1420 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1422 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1424 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1426 return wxCONV_FAILED
;
1428 outLen
+= BYTES_PER_CHAR
;
1430 if ( outLen
> dstLen
)
1431 return wxCONV_FAILED
;
1433 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1439 #else // !WC_UTF16: wchar_t is UTF-32
1441 // ----------------------------------------------------------------------------
1442 // conversions without endianness change
1443 // ----------------------------------------------------------------------------
1446 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1447 const char *src
, size_t srcLen
) const
1449 // use memcpy() as it should be much faster than hand-written loop
1450 srcLen
= GetLength(src
, srcLen
);
1451 if ( srcLen
== wxNO_LEN
)
1452 return wxCONV_FAILED
;
1454 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1457 if ( dstLen
< inLen
)
1458 return wxCONV_FAILED
;
1460 memcpy(dst
, src
, srcLen
);
1467 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1468 const wchar_t *src
, size_t srcLen
) const
1470 if ( srcLen
== wxNO_LEN
)
1471 srcLen
= wxWcslen(src
) + 1;
1473 srcLen
*= BYTES_PER_CHAR
;
1477 if ( dstLen
< srcLen
)
1478 return wxCONV_FAILED
;
1480 memcpy(dst
, src
, srcLen
);
1486 // ----------------------------------------------------------------------------
1487 // endian-reversing conversions
1488 // ----------------------------------------------------------------------------
1491 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1492 const char *src
, size_t srcLen
) const
1494 srcLen
= GetLength(src
, srcLen
);
1495 if ( srcLen
== wxNO_LEN
)
1496 return wxCONV_FAILED
;
1498 srcLen
/= BYTES_PER_CHAR
;
1502 if ( dstLen
< srcLen
)
1503 return wxCONV_FAILED
;
1505 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1506 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1508 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1516 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1517 const wchar_t *src
, size_t srcLen
) const
1519 if ( srcLen
== wxNO_LEN
)
1520 srcLen
= wxWcslen(src
) + 1;
1522 srcLen
*= BYTES_PER_CHAR
;
1526 if ( dstLen
< srcLen
)
1527 return wxCONV_FAILED
;
1529 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1530 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1532 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1539 #endif // WC_UTF16/!WC_UTF16
1542 // ============================================================================
1543 // The classes doing conversion using the iconv_xxx() functions
1544 // ============================================================================
1548 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1549 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1550 // (unless there's yet another bug in glibc) the only case when iconv()
1551 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1552 // left in the input buffer -- when _real_ error occurs,
1553 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1555 // [This bug does not appear in glibc 2.2.]
1556 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1557 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1558 (errno != E2BIG || bufLeft != 0))
1560 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1563 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1565 #define ICONV_T_INVALID ((iconv_t)-1)
1567 #if SIZEOF_WCHAR_T == 4
1568 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1569 #define WC_ENC wxFONTENCODING_UTF32
1570 #elif SIZEOF_WCHAR_T == 2
1571 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1572 #define WC_ENC wxFONTENCODING_UTF16
1573 #else // sizeof(wchar_t) != 2 nor 4
1574 // does this ever happen?
1575 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1578 // ----------------------------------------------------------------------------
1579 // wxMBConv_iconv: encapsulates an iconv character set
1580 // ----------------------------------------------------------------------------
1582 class wxMBConv_iconv
: public wxMBConv
1585 wxMBConv_iconv(const wxChar
*name
);
1586 virtual ~wxMBConv_iconv();
1588 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1589 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1591 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1592 virtual size_t GetMBNulLen() const;
1594 virtual wxMBConv
*Clone() const
1596 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
);
1597 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1602 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1605 // the iconv handlers used to translate from multibyte
1606 // to wide char and in the other direction
1611 // guards access to m2w and w2m objects
1612 wxMutex m_iconvMutex
;
1616 // the name (for iconv_open()) of a wide char charset -- if none is
1617 // available on this machine, it will remain NULL
1618 static wxString ms_wcCharsetName
;
1620 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1621 // different endian-ness than the native one
1622 static bool ms_wcNeedsSwap
;
1625 // name of the encoding handled by this conversion
1628 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1630 size_t m_minMBCharWidth
;
1633 // make the constructor available for unit testing
1634 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1636 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1637 if ( !result
->IsOk() )
1646 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1647 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1649 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1652 m_minMBCharWidth
= 0;
1654 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1655 // names for the charsets
1656 const wxCharBuffer
cname(wxString(name
).ToAscii());
1658 // check for charset that represents wchar_t:
1659 if ( ms_wcCharsetName
.empty() )
1661 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1664 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1665 #else // !wxUSE_FONTMAP
1666 static const wxChar
*names
[] =
1668 #if SIZEOF_WCHAR_T == 4
1670 #elif SIZEOF_WCHAR_T = 2
1675 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1677 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1679 const wxString
nameCS(*names
);
1681 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1682 wxString
nameXE(nameCS
);
1684 #ifdef WORDS_BIGENDIAN
1686 #else // little endian
1690 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1693 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1694 if ( m2w
== ICONV_T_INVALID
)
1696 // try charset w/o bytesex info (e.g. "UCS4")
1697 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1699 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1701 // and check for bytesex ourselves:
1702 if ( m2w
!= ICONV_T_INVALID
)
1704 char buf
[2], *bufPtr
;
1705 wchar_t wbuf
[2], *wbufPtr
;
1713 outsz
= SIZEOF_WCHAR_T
* 2;
1718 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1719 (char**)&wbufPtr
, &outsz
);
1721 if (ICONV_FAILED(res
, insz
))
1723 wxLogLastError(wxT("iconv"));
1724 wxLogError(_("Conversion to charset '%s' doesn't work."),
1727 else // ok, can convert to this encoding, remember it
1729 ms_wcCharsetName
= nameCS
;
1730 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1734 else // use charset not requiring byte swapping
1736 ms_wcCharsetName
= nameXE
;
1740 wxLogTrace(TRACE_STRCONV
,
1741 wxT("iconv wchar_t charset is \"%s\"%s"),
1742 ms_wcCharsetName
.empty() ? _T("<none>")
1743 : ms_wcCharsetName
.c_str(),
1744 ms_wcNeedsSwap
? _T(" (needs swap)")
1747 else // we already have ms_wcCharsetName
1749 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1752 if ( ms_wcCharsetName
.empty() )
1754 w2m
= ICONV_T_INVALID
;
1758 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1759 if ( w2m
== ICONV_T_INVALID
)
1761 wxLogTrace(TRACE_STRCONV
,
1762 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1763 ms_wcCharsetName
.c_str(), cname
.data());
1768 wxMBConv_iconv::~wxMBConv_iconv()
1770 if ( m2w
!= ICONV_T_INVALID
)
1772 if ( w2m
!= ICONV_T_INVALID
)
1776 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
    // find the string length: notice that this must be done differently for
    // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1781 const size_t nulLen
= GetMBNulLen();
1785 return wxCONV_FAILED
;
1788 inbuf
= strlen(psz
); // arguably more optimized than our version
            // for UTF-16/32 not only do we need to have 2/4 consecutive NULs
            // but they also have to start at a character boundary and not span
            // two adjacent characters
1797 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
    // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
    //     Unfortunately there are a couple of global wxCSConv objects such as
    //     wxConvLocal that are used all over wx code, so we have to make sure
    //     the handle is used by at most one thread at a time. Otherwise
    //     only a few wx classes would be safe to use from non-main threads
    //     as MB<->WC conversion would fail "randomly".
1810 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1811 #endif // wxUSE_THREADS
1813 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1815 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1816 wchar_t *bufPtr
= buf
;
1817 const char *pszPtr
= psz
;
1821 // have destination buffer, convert there
1823 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1824 (char**)&bufPtr
, &outbuf
);
1825 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1829 // convert to native endianness
1830 for ( unsigned i
= 0; i
< res
; i
++ )
1831 buf
[n
] = WC_BSWAP(buf
[i
]);
1834 // NUL-terminate the string if there is any space left
1840 // no destination buffer... convert using temp buffer
1841 // to calculate destination buffer requirement
1848 outbuf
= 8 * SIZEOF_WCHAR_T
;
1851 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1852 (char**)&bufPtr
, &outbuf
);
1854 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
1856 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1859 if (ICONV_FAILED(cres
, inbuf
))
1861 //VS: it is ok if iconv fails, hence trace only
1862 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1863 return wxCONV_FAILED
;
1869 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1872 // NB: explained in MB2WC
1873 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1876 size_t inlen
= wxWcslen(psz
);
1877 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1881 wchar_t *tmpbuf
= 0;
1885 // need to copy to temp buffer to switch endianness
1886 // (doing WC_BSWAP twice on the original buffer won't help, as it
1887 // could be in read-only memory, or be accessed in some other thread)
1888 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1889 for ( size_t i
= 0; i
< inlen
; i
++ )
1890 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1892 tmpbuf
[inlen
] = L
'\0';
1898 // have destination buffer, convert there
1899 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1903 // NB: iconv was given only wcslen(psz) characters on input, and so
1904 // it couldn't convert the trailing zero. Let's do it ourselves
1905 // if there's some room left for it in the output buffer.
1911 // no destination buffer: convert using temp buffer
1912 // to calculate destination buffer requirement
1920 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1924 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1932 if (ICONV_FAILED(cres
, inbuf
))
1934 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1935 return wxCONV_FAILED
;
1941 size_t wxMBConv_iconv::GetMBNulLen() const
1943 if ( m_minMBCharWidth
== 0 )
1945 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1948 // NB: explained in MB2WC
1949 wxMutexLocker
lock(self
->m_iconvMutex
);
1952 wchar_t *wnul
= L
"";
1953 char buf
[8]; // should be enough for NUL in any encoding
1954 size_t inLen
= sizeof(wchar_t),
1955 outLen
= WXSIZEOF(buf
);
1956 char *inBuff
= (char *)wnul
;
1957 char *outBuff
= buf
;
1958 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
1960 self
->m_minMBCharWidth
= (size_t)-1;
1964 self
->m_minMBCharWidth
= outBuff
- buf
;
1968 return m_minMBCharWidth
;
1971 #endif // HAVE_ICONV
1974 // ============================================================================
1975 // Win32 conversion classes
1976 // ============================================================================
1978 #ifdef wxHAVE_WIN32_MB2WC
1982 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1983 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1986 class wxMBConv_win32
: public wxMBConv
1991 m_CodePage
= CP_ACP
;
1992 m_minMBCharWidth
= 0;
1995 wxMBConv_win32(const wxMBConv_win32
& conv
)
1997 m_CodePage
= conv
.m_CodePage
;
1998 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2002 wxMBConv_win32(const wxChar
* name
)
2004 m_CodePage
= wxCharsetToCodepage(name
);
2005 m_minMBCharWidth
= 0;
2008 wxMBConv_win32(wxFontEncoding encoding
)
2010 m_CodePage
= wxEncodingToCodepage(encoding
);
2011 m_minMBCharWidth
= 0;
2013 #endif // wxUSE_FONTMAP
2015 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
        // note that we have to use MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and breaks the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
2023 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2024 // Win XP or newer and it is not supported for UTF-[78] so we always
2025 // use our own conversions in this case. See
2026 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2027 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2028 if ( m_CodePage
== CP_UTF8
)
2030 return wxConvUTF8
.MB2WC(buf
, psz
, n
);
2033 if ( m_CodePage
== CP_UTF7
)
2035 return wxConvUTF7
.MB2WC(buf
, psz
, n
);
2039 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2040 IsAtLeastWin2kSP4() )
2042 flags
= MB_ERR_INVALID_CHARS
;
2045 const size_t len
= ::MultiByteToWideChar
2047 m_CodePage
, // code page
2048 flags
, // flags: fall on error
2049 psz
, // input string
2050 -1, // its length (NUL-terminated)
2051 buf
, // output string
2052 buf
? n
: 0 // size of output buffer
2056 // function totally failed
2057 return wxCONV_FAILED
;
2060 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2061 // check if we succeeded, by doing a double trip:
2062 if ( !flags
&& buf
)
2064 const size_t mbLen
= strlen(psz
);
2065 wxCharBuffer
mbBuf(mbLen
);
2066 if ( ::WideCharToMultiByte
2073 mbLen
+ 1, // size in bytes, not length
2077 strcmp(mbBuf
, psz
) != 0 )
2079 // we didn't obtain the same thing we started from, hence
2080 // the conversion was lossy and we consider that it failed
2081 return wxCONV_FAILED
;
2085 // note that it returns count of written chars for buf != NULL and size
2086 // of the needed buffer for buf == NULL so in either case the length of
2087 // the string (which never includes the terminating NUL) is one less
2091 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2094 we have a problem here: by default, WideCharToMultiByte() may
2095 replace characters unrepresentable in the target code page with bad
2096 quality approximations such as turning "1/2" symbol (U+00BD) into
2097 "1" for the code pages which don't have it and we, obviously, want
2098 to avoid this at any price
2100 the trouble is that this function does it _silently_, i.e. it won't
2101 even tell us whether it did or not... Win98/2000 and higher provide
2102 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2103 we have to resort to a round trip, i.e. check that converting back
2104 results in the same string -- this is, of course, expensive but
2105 otherwise we simply can't be sure to not garble the data.
        // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
        // it doesn't work with CJK encodings (which we test for rather roughly
        // here...) nor with UTF-7/8 nor, of course, with Windows versions not
        // supporting it
2112 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2115 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2117 // it's our lucky day
2118 flags
= WC_NO_BEST_FIT_CHARS
;
2119 pUsedDef
= &usedDef
;
2121 else // old system or unsupported encoding
2127 const size_t len
= ::WideCharToMultiByte
2129 m_CodePage
, // code page
2130 flags
, // either none or no best fit
2131 pwz
, // input string
2132 -1, // it is (wide) NUL-terminated
2133 buf
, // output buffer
2134 buf
? n
: 0, // and its size
2135 NULL
, // default "replacement" char
2136 pUsedDef
// [out] was it used?
2141 // function totally failed
2142 return wxCONV_FAILED
;
2145 // if we were really converting, check if we succeeded
2150 // check if the conversion failed, i.e. if any replacements
2153 return wxCONV_FAILED
;
2155 else // we must resort to double tripping...
2157 wxWCharBuffer
wcBuf(n
);
2158 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2159 wcscmp(wcBuf
, pwz
) != 0 )
2161 // we didn't obtain the same thing we started from, hence
2162 // the conversion was lossy and we consider that it failed
2163 return wxCONV_FAILED
;
2168 // see the comment above for the reason of "len - 1"
2172 virtual size_t GetMBNulLen() const
2174 if ( m_minMBCharWidth
== 0 )
2176 int len
= ::WideCharToMultiByte
2178 m_CodePage
, // code page
2180 L
"", // input string
2181 1, // translate just the NUL
2182 NULL
, // output buffer
2184 NULL
, // no replacement char
2185 NULL
// [out] don't care if it was used
2188 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2192 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2193 self
->m_minMBCharWidth
= (size_t)-1;
2197 self
->m_minMBCharWidth
= (size_t)-1;
2203 self
->m_minMBCharWidth
= len
;
2208 return m_minMBCharWidth
;
2211 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2213 bool IsOk() const { return m_CodePage
!= -1; }
2216 static bool CanUseNoBestFit()
2218 static int s_isWin98Or2k
= -1;
2220 if ( s_isWin98Or2k
== -1 )
2223 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2226 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2230 s_isWin98Or2k
= verMaj
>= 5;
2234 // unknown: be conservative by default
2239 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2242 return s_isWin98Or2k
== 1;
2245 static bool IsAtLeastWin2kSP4()
2250 static int s_isAtLeastWin2kSP4
= -1;
2252 if ( s_isAtLeastWin2kSP4
== -1 )
2254 OSVERSIONINFOEX ver
;
2256 memset(&ver
, 0, sizeof(ver
));
2257 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2258 GetVersionEx((OSVERSIONINFO
*)&ver
);
2260 s_isAtLeastWin2kSP4
=
2261 ((ver
.dwMajorVersion
> 5) || // Vista+
2262 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2263 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2264 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2268 return s_isAtLeastWin2kSP4
== 1;
2273 // the code page we're working with
2276 // cached result of GetMBNulLen(), set to 0 initially meaning
2278 size_t m_minMBCharWidth
;
2281 #endif // wxHAVE_WIN32_MB2WC
2283 // ============================================================================
2284 // Cocoa conversion classes
2285 // ============================================================================
2287 #if defined(__WXCOCOA__)
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
//     Strangely enough, Core Foundation uses UTF-32 internally quite a
//     bit - it's just not public (yet).
2293 #include <CoreFoundation/CFString.h>
2294 #include <CoreFoundation/CFStringEncodingExt.h>
2296 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2298 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2302 case wxFONTENCODING_DEFAULT
:
2303 enc
= CFStringGetSystemEncoding();
2306 case wxFONTENCODING_ISO8859_1
:
2307 enc
= kCFStringEncodingISOLatin1
;
2309 case wxFONTENCODING_ISO8859_2
:
2310 enc
= kCFStringEncodingISOLatin2
;
2312 case wxFONTENCODING_ISO8859_3
:
2313 enc
= kCFStringEncodingISOLatin3
;
2315 case wxFONTENCODING_ISO8859_4
:
2316 enc
= kCFStringEncodingISOLatin4
;
2318 case wxFONTENCODING_ISO8859_5
:
2319 enc
= kCFStringEncodingISOLatinCyrillic
;
2321 case wxFONTENCODING_ISO8859_6
:
2322 enc
= kCFStringEncodingISOLatinArabic
;
2324 case wxFONTENCODING_ISO8859_7
:
2325 enc
= kCFStringEncodingISOLatinGreek
;
2327 case wxFONTENCODING_ISO8859_8
:
2328 enc
= kCFStringEncodingISOLatinHebrew
;
2330 case wxFONTENCODING_ISO8859_9
:
2331 enc
= kCFStringEncodingISOLatin5
;
2333 case wxFONTENCODING_ISO8859_10
:
2334 enc
= kCFStringEncodingISOLatin6
;
2336 case wxFONTENCODING_ISO8859_11
:
2337 enc
= kCFStringEncodingISOLatinThai
;
2339 case wxFONTENCODING_ISO8859_13
:
2340 enc
= kCFStringEncodingISOLatin7
;
2342 case wxFONTENCODING_ISO8859_14
:
2343 enc
= kCFStringEncodingISOLatin8
;
2345 case wxFONTENCODING_ISO8859_15
:
2346 enc
= kCFStringEncodingISOLatin9
;
2349 case wxFONTENCODING_KOI8
:
2350 enc
= kCFStringEncodingKOI8_R
;
2352 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2353 enc
= kCFStringEncodingDOSRussian
;
2356 // case wxFONTENCODING_BULGARIAN :
2360 case wxFONTENCODING_CP437
:
2361 enc
= kCFStringEncodingDOSLatinUS
;
2363 case wxFONTENCODING_CP850
:
2364 enc
= kCFStringEncodingDOSLatin1
;
2366 case wxFONTENCODING_CP852
:
2367 enc
= kCFStringEncodingDOSLatin2
;
2369 case wxFONTENCODING_CP855
:
2370 enc
= kCFStringEncodingDOSCyrillic
;
2372 case wxFONTENCODING_CP866
:
2373 enc
= kCFStringEncodingDOSRussian
;
2375 case wxFONTENCODING_CP874
:
2376 enc
= kCFStringEncodingDOSThai
;
2378 case wxFONTENCODING_CP932
:
2379 enc
= kCFStringEncodingDOSJapanese
;
2381 case wxFONTENCODING_CP936
:
2382 enc
= kCFStringEncodingDOSChineseSimplif
;
2384 case wxFONTENCODING_CP949
:
2385 enc
= kCFStringEncodingDOSKorean
;
2387 case wxFONTENCODING_CP950
:
2388 enc
= kCFStringEncodingDOSChineseTrad
;
2390 case wxFONTENCODING_CP1250
:
2391 enc
= kCFStringEncodingWindowsLatin2
;
2393 case wxFONTENCODING_CP1251
:
2394 enc
= kCFStringEncodingWindowsCyrillic
;
2396 case wxFONTENCODING_CP1252
:
2397 enc
= kCFStringEncodingWindowsLatin1
;
2399 case wxFONTENCODING_CP1253
:
2400 enc
= kCFStringEncodingWindowsGreek
;
2402 case wxFONTENCODING_CP1254
:
2403 enc
= kCFStringEncodingWindowsLatin5
;
2405 case wxFONTENCODING_CP1255
:
2406 enc
= kCFStringEncodingWindowsHebrew
;
2408 case wxFONTENCODING_CP1256
:
2409 enc
= kCFStringEncodingWindowsArabic
;
2411 case wxFONTENCODING_CP1257
:
2412 enc
= kCFStringEncodingWindowsBalticRim
;
2414 // This only really encodes to UTF7 (if that) evidently
2415 // case wxFONTENCODING_UTF7 :
2416 // enc = kCFStringEncodingNonLossyASCII ;
2418 case wxFONTENCODING_UTF8
:
2419 enc
= kCFStringEncodingUTF8
;
2421 case wxFONTENCODING_EUC_JP
:
2422 enc
= kCFStringEncodingEUC_JP
;
2424 case wxFONTENCODING_UTF16
:
2425 enc
= kCFStringEncodingUnicode
;
2427 case wxFONTENCODING_MACROMAN
:
2428 enc
= kCFStringEncodingMacRoman
;
2430 case wxFONTENCODING_MACJAPANESE
:
2431 enc
= kCFStringEncodingMacJapanese
;
2433 case wxFONTENCODING_MACCHINESETRAD
:
2434 enc
= kCFStringEncodingMacChineseTrad
;
2436 case wxFONTENCODING_MACKOREAN
:
2437 enc
= kCFStringEncodingMacKorean
;
2439 case wxFONTENCODING_MACARABIC
:
2440 enc
= kCFStringEncodingMacArabic
;
2442 case wxFONTENCODING_MACHEBREW
:
2443 enc
= kCFStringEncodingMacHebrew
;
2445 case wxFONTENCODING_MACGREEK
:
2446 enc
= kCFStringEncodingMacGreek
;
2448 case wxFONTENCODING_MACCYRILLIC
:
2449 enc
= kCFStringEncodingMacCyrillic
;
2451 case wxFONTENCODING_MACDEVANAGARI
:
2452 enc
= kCFStringEncodingMacDevanagari
;
2454 case wxFONTENCODING_MACGURMUKHI
:
2455 enc
= kCFStringEncodingMacGurmukhi
;
2457 case wxFONTENCODING_MACGUJARATI
:
2458 enc
= kCFStringEncodingMacGujarati
;
2460 case wxFONTENCODING_MACORIYA
:
2461 enc
= kCFStringEncodingMacOriya
;
2463 case wxFONTENCODING_MACBENGALI
:
2464 enc
= kCFStringEncodingMacBengali
;
2466 case wxFONTENCODING_MACTAMIL
:
2467 enc
= kCFStringEncodingMacTamil
;
2469 case wxFONTENCODING_MACTELUGU
:
2470 enc
= kCFStringEncodingMacTelugu
;
2472 case wxFONTENCODING_MACKANNADA
:
2473 enc
= kCFStringEncodingMacKannada
;
2475 case wxFONTENCODING_MACMALAJALAM
:
2476 enc
= kCFStringEncodingMacMalayalam
;
2478 case wxFONTENCODING_MACSINHALESE
:
2479 enc
= kCFStringEncodingMacSinhalese
;
2481 case wxFONTENCODING_MACBURMESE
:
2482 enc
= kCFStringEncodingMacBurmese
;
2484 case wxFONTENCODING_MACKHMER
:
2485 enc
= kCFStringEncodingMacKhmer
;
2487 case wxFONTENCODING_MACTHAI
:
2488 enc
= kCFStringEncodingMacThai
;
2490 case wxFONTENCODING_MACLAOTIAN
:
2491 enc
= kCFStringEncodingMacLaotian
;
2493 case wxFONTENCODING_MACGEORGIAN
:
2494 enc
= kCFStringEncodingMacGeorgian
;
2496 case wxFONTENCODING_MACARMENIAN
:
2497 enc
= kCFStringEncodingMacArmenian
;
2499 case wxFONTENCODING_MACCHINESESIMP
:
2500 enc
= kCFStringEncodingMacChineseSimp
;
2502 case wxFONTENCODING_MACTIBETAN
:
2503 enc
= kCFStringEncodingMacTibetan
;
2505 case wxFONTENCODING_MACMONGOLIAN
:
2506 enc
= kCFStringEncodingMacMongolian
;
2508 case wxFONTENCODING_MACETHIOPIC
:
2509 enc
= kCFStringEncodingMacEthiopic
;
2511 case wxFONTENCODING_MACCENTRALEUR
:
2512 enc
= kCFStringEncodingMacCentralEurRoman
;
2514 case wxFONTENCODING_MACVIATNAMESE
:
2515 enc
= kCFStringEncodingMacVietnamese
;
2517 case wxFONTENCODING_MACARABICEXT
:
2518 enc
= kCFStringEncodingMacExtArabic
;
2520 case wxFONTENCODING_MACSYMBOL
:
2521 enc
= kCFStringEncodingMacSymbol
;
2523 case wxFONTENCODING_MACDINGBATS
:
2524 enc
= kCFStringEncodingMacDingbats
;
2526 case wxFONTENCODING_MACTURKISH
:
2527 enc
= kCFStringEncodingMacTurkish
;
2529 case wxFONTENCODING_MACCROATIAN
:
2530 enc
= kCFStringEncodingMacCroatian
;
2532 case wxFONTENCODING_MACICELANDIC
:
2533 enc
= kCFStringEncodingMacIcelandic
;
2535 case wxFONTENCODING_MACROMANIAN
:
2536 enc
= kCFStringEncodingMacRomanian
;
2538 case wxFONTENCODING_MACCELTIC
:
2539 enc
= kCFStringEncodingMacCeltic
;
2541 case wxFONTENCODING_MACGAELIC
:
2542 enc
= kCFStringEncodingMacGaelic
;
2544 // case wxFONTENCODING_MACKEYBOARD :
2545 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2549 // because gcc is picky
2556 class wxMBConv_cocoa
: public wxMBConv
2561 Init(CFStringGetSystemEncoding()) ;
2564 wxMBConv_cocoa(const wxMBConv_cocoa
& conv
)
2566 m_encoding
= conv
.m_encoding
;
2570 wxMBConv_cocoa(const wxChar
* name
)
2572 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2576 wxMBConv_cocoa(wxFontEncoding encoding
)
2578 Init( wxCFStringEncFromFontEnc(encoding
) );
2585 void Init( CFStringEncoding encoding
)
2587 m_encoding
= encoding
;
2590 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2594 CFStringRef theString
= CFStringCreateWithBytes (
2595 NULL
, //the allocator
2596 (const UInt8
*)szUnConv
,
2599 false //no BOM/external representation
2602 wxASSERT(theString
);
2604 size_t nOutLength
= CFStringGetLength(theString
);
2608 CFRelease(theString
);
2612 CFRange theRange
= { 0, nOutSize
};
2614 #if SIZEOF_WCHAR_T == 4
2615 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2618 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2620 CFRelease(theString
);
2622 szUniCharBuffer
[nOutLength
] = '\0';
2624 #if SIZEOF_WCHAR_T == 4
2625 wxMBConvUTF16 converter
;
2626 converter
.MB2WC( szOut
, (const char*)szUniCharBuffer
, nOutSize
);
2627 delete [] szUniCharBuffer
;
2633 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2637 size_t nRealOutSize
;
2638 size_t nBufSize
= wxWcslen(szUnConv
);
2639 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2641 #if SIZEOF_WCHAR_T == 4
2642 wxMBConvUTF16 converter
;
2643 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2644 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1];
2645 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
));
2646 nBufSize
/= sizeof(UniChar
);
2649 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2653 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2656 wxASSERT(theString
);
2658 //Note that CER puts a BOM when converting to unicode
2659 //so we check and use getchars instead in that case
2660 if (m_encoding
== kCFStringEncodingUnicode
)
2663 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2665 nRealOutSize
= CFStringGetLength(theString
) + 1;
2671 CFRangeMake(0, CFStringGetLength(theString
)),
2673 0, //what to put in characters that can't be converted -
2674 //0 tells CFString to return NULL if it meets such a character
2675 false, //not an external representation
2678 (CFIndex
*) &nRealOutSize
2682 CFRelease(theString
);
2684 #if SIZEOF_WCHAR_T == 4
2685 delete[] szUniBuffer
;
2688 return nRealOutSize
- 1;
2691 virtual wxMBConv
*Clone() const { return new wxMBConv_cocoa(*this); }
2695 return m_encoding
!= kCFStringEncodingInvalidId
&&
2696 CFStringIsEncodingAvailable(m_encoding
);
2700 CFStringEncoding m_encoding
;
2703 #endif // defined(__WXCOCOA__)
2705 // ============================================================================
2706 // Mac conversion classes
2707 // ============================================================================
2709 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2711 class wxMBConv_mac
: public wxMBConv
2716 Init(CFStringGetSystemEncoding()) ;
2719 wxMBConv_mac(const wxMBConv_mac
& conv
)
2721 Init(conv
.m_char_encoding
);
2725 wxMBConv_mac(const wxChar
* name
)
2727 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) );
2731 wxMBConv_mac(wxFontEncoding encoding
)
2733 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2738 OSStatus status
= noErr
;
2739 status
= TECDisposeConverter(m_MB2WC_converter
);
2740 status
= TECDisposeConverter(m_WC2MB_converter
);
2744 void Init( TextEncodingBase encoding
)
2746 OSStatus status
= noErr
;
2747 m_char_encoding
= encoding
;
2748 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
, 0, kUnicode16BitFormat
) ;
2750 status
= TECCreateConverter(&m_MB2WC_converter
,
2752 m_unicode_encoding
);
2753 status
= TECCreateConverter(&m_WC2MB_converter
,
2758 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2760 OSStatus status
= noErr
;
2761 ByteCount byteOutLen
;
2762 ByteCount byteInLen
= strlen(psz
) ;
2763 wchar_t *tbuf
= NULL
;
2764 UniChar
* ubuf
= NULL
;
2769 // Apple specs say at least 32
2770 n
= wxMax( 32, byteInLen
) ;
2771 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2774 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2776 #if SIZEOF_WCHAR_T == 4
2777 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2779 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2782 status
= TECConvertText(
2783 m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2784 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2786 #if SIZEOF_WCHAR_T == 4
2787 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2788 // is not properly terminated we get random characters at the end
2789 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2790 wxMBConvUTF16 converter
;
2791 res
= converter
.MB2WC( (buf
? buf
: tbuf
), (const char*)ubuf
, n
) ;
2794 res
= byteOutLen
/ sizeof( UniChar
) ;
2800 if ( buf
&& res
< n
)
2806 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2808 OSStatus status
= noErr
;
2809 ByteCount byteOutLen
;
2810 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2816 // Apple specs say at least 32
2817 n
= wxMax( 32, ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2818 tbuf
= (char*) malloc( n
) ;
2821 ByteCount byteBufferLen
= n
;
2822 UniChar
* ubuf
= NULL
;
2824 #if SIZEOF_WCHAR_T == 4
2825 wxMBConvUTF16 converter
;
2826 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2827 byteInLen
= unicharlen
;
2828 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2829 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2831 ubuf
= (UniChar
*) psz
;
2834 status
= TECConvertText(
2835 m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2836 (TextPtr
) (buf
? buf
: tbuf
), byteBufferLen
, &byteOutLen
);
2838 #if SIZEOF_WCHAR_T == 4
2845 size_t res
= byteOutLen
;
2846 if ( buf
&& res
< n
)
2850 //we need to double-trip to verify it didn't insert any ? in place
2851 //of bogus characters
2852 wxWCharBuffer
wcBuf(n
);
2853 size_t pszlen
= wxWcslen(psz
);
2854 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2855 wxWcslen(wcBuf
) != pszlen
||
2856 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2858 // we didn't obtain the same thing we started from, hence
2859 // the conversion was lossy and we consider that it failed
2860 return wxCONV_FAILED
;
2867 virtual wxMBConv
*Clone() const { return new wxMBConv_mac(*this); }
2870 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2873 TECObjectRef m_MB2WC_converter
;
2874 TECObjectRef m_WC2MB_converter
;
2876 TextEncodingBase m_char_encoding
;
2877 TextEncodingBase m_unicode_encoding
;
2880 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2882 // ============================================================================
2883 // wxEncodingConverter based conversion classes
2884 // ============================================================================
2888 class wxMBConv_wxwin
: public wxMBConv
2893 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2894 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2898 // temporarily just use wxEncodingConverter stuff,
2899 // so that it works while a better implementation is built
2900 wxMBConv_wxwin(const wxChar
* name
)
2903 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2905 m_enc
= wxFONTENCODING_SYSTEM
;
2910 wxMBConv_wxwin(wxFontEncoding enc
)
2917 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2919 size_t inbuf
= strlen(psz
);
2922 if (!m2w
.Convert(psz
, buf
))
2923 return wxCONV_FAILED
;
2928 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2930 const size_t inbuf
= wxWcslen(psz
);
2933 if (!w2m
.Convert(psz
, buf
))
2934 return wxCONV_FAILED
;
2940 virtual size_t GetMBNulLen() const
2944 case wxFONTENCODING_UTF16BE
:
2945 case wxFONTENCODING_UTF16LE
:
2948 case wxFONTENCODING_UTF32BE
:
2949 case wxFONTENCODING_UTF32LE
:
2957 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2959 bool IsOk() const { return m_ok
; }
2962 wxFontEncoding m_enc
;
2963 wxEncodingConverter m2w
, w2m
;
2966 // were we initialized successfully?
2969 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2972 // make the constructors available for unit testing
2973 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2975 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2976 if ( !result
->IsOk() )
2985 #endif // wxUSE_FONTMAP
2987 // ============================================================================
2988 // wxCSConv implementation
2989 // ============================================================================
2991 void wxCSConv::Init()
2998 wxCSConv::wxCSConv(const wxChar
*charset
)
3008 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
3010 m_encoding
= wxFONTENCODING_SYSTEM
;
// construct from an encoding value and store it in m_encoding
3014 wxCSConv::wxCSConv(wxFontEncoding encoding
)
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not accepted here: they
// trigger a debug failure message and are replaced by wxFONTENCODING_SYSTEM
3016 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
3018 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3020 encoding
= wxFONTENCODING_SYSTEM
;
3025 m_encoding
= encoding
;
3028 wxCSConv::~wxCSConv()
// copy constructor: takes over the charset name (through SetName()) and the
// encoding of the source object
3033 wxCSConv::wxCSConv(const wxCSConv
& conv
)
3038 SetName(conv
.m_name
);
3039 m_encoding
= conv
.m_encoding
;
// assignment: takes over the charset name (through SetName()) and the
// encoding of the right hand side
3042 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
3046 SetName(conv
.m_name
);
3047 m_encoding
= conv
.m_encoding
;
3052 void wxCSConv::Clear()
// remember the charset name: m_name receives a copy made with wxStrdup(), not
// the caller's pointer
3061 void wxCSConv::SetName(const wxChar
*charset
)
3065 m_name
= wxStrdup(charset
);
3071 #include "wx/hashmap.h"
// hash map type keyed by wxFontEncoding with wxString values
3073 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
3074 wxEncodingNameCache
);
// file-local cache filled by wxCSConv::DoCreate(): for an encoding it holds
// the charset name tried with wxMBConv_iconv, or an empty string to record
// that the lookup failed
3076 static wxEncodingNameCache gs_nameCache
;
// Creates the wxMBConv object used for the real conversions of this wxCSConv.
// The candidates visible below appear in this order: wxMBConv_iconv,
// wxMBConv_win32, wxMBConv_mac, wxMBConv_cocoa (each in its own conditional
// section), the built-in UTF converters and finally wxMBConv_wxwin; the tail
// of the function logs an error message.
3079 wxMBConv
*wxCSConv::DoCreate() const
3082 wxLogTrace(TRACE_STRCONV
,
3083 wxT("creating conversion for %s"),
3085 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
3086 #endif // wxUSE_FONTMAP
3088 // check for the special case of ASCII or ISO8859-1 charset: as we have
3089 // special knowledge of it anyhow, we don't need to create a special
3090 // conversion object
3091 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
3092 m_encoding
== wxFONTENCODING_DEFAULT
)
3094 // don't convert at all
3098 // we trust OS to do conversion better than we can so try external
3099 // conversion methods first
3101 // the full order is:
3102 // 1. OS conversion (iconv() under Unix or Win32 API)
3103 // 2. hard coded conversions for UTF
3104 // 3. wxEncodingConverter as fall back
3110 #endif // !wxUSE_FONTMAP
// work on local copies of the name and the encoding
3112 wxString
name(m_name
);
3113 wxFontEncoding
encoding(m_encoding
);
// if we have a charset name, try it with iconv directly
3115 if ( !name
.empty() )
3117 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
// NOTE(review): the left hand side of this call is not visible here;
// presumably its result replaces "encoding" -- confirm
3125 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
3126 #endif // wxUSE_FONTMAP
// look for a cached name for this encoding; an empty cached string is the
// marker stored when the search below has already failed once
3130 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3131 if ( it
!= gs_nameCache
.end() )
3133 if ( it
->second
.empty() )
3136 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
// nothing usable in the cache: walk over all names known for this encoding
// (the array is terminated by a null entry) and try each of them in turn
3143 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3145 for ( ; *names
; ++names
)
3147 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
// remember the name in the cache
// NOTE(review): presumably only done when the converter above is usable; the
// check itself is not visible here
3150 gs_nameCache
[encoding
] = *names
;
3157 gs_nameCache
[encoding
] = _T(""); // cache the failure
3159 #endif // wxUSE_FONTMAP
3161 #endif // HAVE_ICONV
3163 #ifdef wxHAVE_WIN32_MB2WC
// the charset name, when set, takes precedence over the encoding value
3166 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3167 : new wxMBConv_win32(m_encoding
);
3176 #endif // wxHAVE_WIN32_MB2WC
3178 #if defined(__WXMAC__)
3180 // leave UTF16 and UTF32 to the built-ins of wx
3181 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3182 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3185 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
3186 : new wxMBConv_mac(m_encoding
);
3188 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
3198 #if defined(__WXCOCOA__)
3200 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
3203 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
3204 : new wxMBConv_cocoa(m_encoding
);
3206 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
// from here on the built-in converters are considered, selected by "enc"
3217 wxFontEncoding enc
= m_encoding
;
// only a name was given: map it to an encoding value first
3219 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3221 // use "false" to suppress interactive dialogs -- we can be called from
3222 // anywhere and popping up a dialog from here is the last thing we want to do
3224 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3226 #endif // wxUSE_FONTMAP
// encodings for which a dedicated converter class exists
3230 case wxFONTENCODING_UTF7
:
3231 return new wxMBConvUTF7
;
3233 case wxFONTENCODING_UTF8
:
3234 return new wxMBConvUTF8
;
3236 case wxFONTENCODING_UTF16BE
:
3237 return new wxMBConvUTF16BE
;
3239 case wxFONTENCODING_UTF16LE
:
3240 return new wxMBConvUTF16LE
;
3242 case wxFONTENCODING_UTF32BE
:
3243 return new wxMBConvUTF32BE
;
3245 case wxFONTENCODING_UTF32LE
:
3246 return new wxMBConvUTF32LE
;
3249 // nothing to do but put here to suppress gcc warnings
// the wxEncodingConverter-based class is the last candidate; as above, the
// charset name takes precedence over the encoding value
3256 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3257 : new wxMBConv_wxwin(m_encoding
);
3263 #endif // wxUSE_FONTMAP
3265 // NB: This is a hack to prevent deadlock. What could otherwise happen
3266 // in Unicode build: wxConvLocal creation ends up being here
3267 // because of some failure and logs the error. But wxLog will try to
3268 // attach timestamp, for which it will need wxConvLocal (to convert
3269 // time to char* and then wchar_t*), but that fails, tries to log
3270 // error, but wxLog has a (already locked) critical section that
3271 // guards static buffer.
// the flag is set around the wxLogError() call and cleared afterwards, so a
// nested call arriving here while the message is being logged skips logging
3272 static bool alreadyLoggingError
= false;
3273 if (!alreadyLoggingError
)
3275 alreadyLoggingError
= true;
3276 wxLogError(_("Cannot convert from the charset '%s'!"),
3280 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
3281 #else // !wxUSE_FONTMAP
// NOTE(review): m_encoding is compared with wxFontEncoding constants above,
// so it is not a string, yet it is passed for a %s conversion here -- this
// looks like a format/argument mismatch; confirm and use an integer
// conversion instead
3282 wxString::Format(_("encoding %s"), m_encoding
).c_str()
3283 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3286 alreadyLoggingError
= false;
// creates the real conversion object with DoCreate(), stores it in m_convReal
// and clears m_deferred; the method is const, so all members are written
// through a non-const pointer to this object
// NOTE(review): presumably everything below only runs while m_deferred is
// still set; the guarding test is not visible here -- confirm
3292 void wxCSConv::CreateConvIfNeeded() const
3296 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3299 // if we have neither the name nor the encoding, use the default
3300 // encoding for this system
3301 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3303 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
3305 #endif // wxUSE_INTL
3307 self
->m_convReal
= DoCreate();
3308 self
->m_deferred
= false;
// multibyte -> wide conversion: makes sure the real converter has been
// created, forwards to it, and otherwise falls back to copying the bytes
// NOTE(review): the test choosing between the two paths is not visible here
3312 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3314 CreateConvIfNeeded();
3317 return m_convReal
->MB2WC(buf
, psz
, n
);
// fallback: each input byte is widened through unsigned char, so bytes with
// the high bit set yield values 128..255 and not negative ones
3320 size_t len
= strlen(psz
);
// "<=" makes the loop copy the terminating NUL as well
// NOTE(review): n is not consulted in the visible part of this loop --
// confirm that buf always has room for len + 1 characters
3324 for (size_t c
= 0; c
<= len
; c
++)
3325 buf
[c
] = (unsigned char)(psz
[c
]);
// wide -> multibyte conversion: makes sure the real converter has been
// created, forwards to it, and otherwise falls back to narrowing each
// character with a cast
// NOTE(review): the test choosing between the two paths is not visible here
3331 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3333 CreateConvIfNeeded();
3336 return m_convReal
->WC2MB(buf
, psz
, n
);
3339 const size_t len
= wxWcslen(psz
);
// first loop: can fail with wxCONV_FAILED, otherwise stores each character
// (including the terminating NUL, because of "<=") narrowed to char
// NOTE(review): the condition leading to the failure return is not visible
// here, and n is not consulted in the visible part of the loop -- confirm
3342 for (size_t c
= 0; c
<= len
; c
++)
3345 return wxCONV_FAILED
;
3347 buf
[c
] = (char)psz
[c
];
// second loop: same range and same failure return, but no store to buf is
// visible
// NOTE(review): presumably the variant used when only the length is wanted
// -- confirm
3352 for (size_t c
= 0; c
<= len
; c
++)
3355 return wxCONV_FAILED
;
// makes sure the real converter has been created and returns its
// GetMBNulLen() value
// NOTE(review): what is returned when there is no real converter is not
// visible here
3362 size_t wxCSConv::GetMBNulLen() const
3364 CreateConvIfNeeded();
3368 return m_convReal
->GetMBNulLen();
3374 // ----------------------------------------------------------------------------
3376 // ----------------------------------------------------------------------------
// file-local converter objects; the type of wxConvLibcObj is chosen by the
// surrounding preprocessor conditionals among wxMBConv_win32, wxMBConv_mac
// and wxMBConvLibc
3379 static wxMBConv_win32 wxConvLibcObj
;
3380 #elif defined(__WXMAC__) && !defined(__MACH__)
3381 static wxMBConv_mac wxConvLibcObj
;
3383 static wxMBConvLibc wxConvLibcObj
;
3386 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3387 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
3388 static wxMBConvUTF7 wxConvUTF7Obj
;
3389 static wxMBConvUTF8 wxConvUTF8Obj
;
// exported names: references bound to the static objects defined above
3391 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3392 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3393 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3394 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3395 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
// wxConvCurrent is a pointer and initially points to the libc converter
3396 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
3397 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3405 #else // !wxUSE_WCHAR_T
3407 // stand-ins in absence of wchar_t
3408 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3413 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T