1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
24 #include "wx/strconv.h"
29 #include "wx/msw/private.h"
30 #include "wx/msw/missing.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #define wxHAVE_WIN32_MB2WC
51 #include "wx/thread.h"
54 #include "wx/encconv.h"
55 #include "wx/fontmap.h"
59 #include <ATSUnicode.h>
60 #include <TextCommon.h>
61 #include <TextEncodingConverter.h>
64 // includes Mac headers
65 #include "wx/mac/private.h"
69 #define TRACE_STRCONV _T("strconv")
71 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
73 #if SIZEOF_WCHAR_T == 2
78 // ============================================================================
80 // ============================================================================
82 // helper function of cMB2WC(): check if n bytes at this location are all NUL
83 static bool NotAllNULs(const char *p
, size_t n
)
85 while ( n
&& *p
++ == '\0' )
91 // ----------------------------------------------------------------------------
92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
93 // ----------------------------------------------------------------------------
95 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
100 *output
= (wxUint16
) input
;
104 else if (input
>= 0x110000)
106 return wxCONV_FAILED
;
112 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
113 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
120 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
122 if ((*input
< 0xd800) || (*input
> 0xdfff))
127 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
130 return wxCONV_FAILED
;
134 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
140 typedef wchar_t wxDecodeSurrogate_t
;
142 typedef wxUint16 wxDecodeSurrogate_t
;
143 #endif // WC_UTF16/!WC_UTF16
145 // returns the next UTF-32 character from the wchar_t buffer and advances the
146 // pointer to the character after this one
148 // if an invalid character is found, *pSrc is set to NULL, the caller must
150 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
154 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
155 if ( n
== wxCONV_FAILED
)
163 // ----------------------------------------------------------------------------
165 // ----------------------------------------------------------------------------
168 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
169 const char *src
, size_t srcLen
) const
171 // although new conversion classes are supposed to implement this function
172 // directly, the existing ones only implement the old MB2WC() and so, to
173 // avoid having to rewrite all conversion classes at once, we provide a
174 // default (but not efficient) implementation of this one in terms of the
175 // old function by copying the input to ensure that it's NUL-terminated and
176 // then using MB2WC() to convert it
178 // the number of chars [which would be] written to dst [if it were not NULL]
179 size_t dstWritten
= 0;
181 // the number of NULs terminating this string
182 size_t nulLen
= 0; // not really needed, but just to avoid warnings
184 // if we were not given the input size we just have to assume that the
185 // string is properly terminated as we have no way of knowing how long it
186 // is anyhow, but if we do have the size check whether there are enough
190 if ( srcLen
!= wxNO_LEN
)
192 // we need to know how to find the end of this string
193 nulLen
= GetMBNulLen();
194 if ( nulLen
== wxCONV_FAILED
)
195 return wxCONV_FAILED
;
197 // if there are enough NULs we can avoid the copy
198 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
200 // make a copy in order to properly NUL-terminate the string
201 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
202 char * const p
= bufTmp
.data();
203 memcpy(p
, src
, srcLen
);
204 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
210 srcEnd
= src
+ srcLen
;
212 else // quit after the first loop iteration
219 // try to convert the current chunk
220 size_t lenChunk
= MB2WC(NULL
, src
, 0);
221 if ( lenChunk
== wxCONV_FAILED
)
222 return wxCONV_FAILED
;
224 lenChunk
++; // for the L'\0' at the end of this chunk
226 dstWritten
+= lenChunk
;
230 // nothing left in the input string, conversion succeeded
236 if ( dstWritten
> dstLen
)
237 return wxCONV_FAILED
;
239 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
240 return wxCONV_FAILED
;
247 // we convert just one chunk in this case as this is the entire
252 // advance the input pointer past the end of this chunk
253 while ( NotAllNULs(src
, nulLen
) )
255 // notice that we must skip over multiple bytes here as we suppose
256 // that if NUL takes 2 or 4 bytes, then all the other characters do
257 // too and so if advanced by a single byte we might erroneously
258 // detect sequences of NUL bytes in the middle of the input
262 src
+= nulLen
; // skipping over its terminator as well
264 // note that ">=" (and not just "==") is needed here as the terminator
265 // we skipped just above could be inside or just after the buffer
266 // delimited by srcEnd
275 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
276 const wchar_t *src
, size_t srcLen
) const
278 // the number of chars [which would be] written to dst [if it were not NULL]
279 size_t dstWritten
= 0;
281 // make a copy of the input string unless it is already properly
284 // if we don't know its length we have no choice but to assume that it is,
285 // indeed, properly terminated
286 wxWCharBuffer bufTmp
;
287 if ( srcLen
== wxNO_LEN
)
289 srcLen
= wxWcslen(src
) + 1;
291 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
293 // make a copy in order to properly NUL-terminate the string
294 bufTmp
= wxWCharBuffer(srcLen
);
295 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
299 const size_t lenNul
= GetMBNulLen();
300 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
302 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
304 // try to convert the current chunk
305 size_t lenChunk
= WC2MB(NULL
, src
, 0);
307 if ( lenChunk
== wxCONV_FAILED
)
308 return wxCONV_FAILED
;
311 dstWritten
+= lenChunk
;
315 if ( dstWritten
> dstLen
)
316 return wxCONV_FAILED
;
318 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
319 return wxCONV_FAILED
;
328 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
330 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
331 if ( rc
!= wxCONV_FAILED
)
333 // ToWChar() returns the buffer length, i.e. including the trailing
334 // NUL, while this method doesn't take it into account
341 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
343 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
344 if ( rc
!= wxCONV_FAILED
)
352 wxMBConv::~wxMBConv()
354 // nothing to do here (necessary for Darwin linking probably)
357 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
361 // calculate the length of the buffer needed first
362 const size_t nLen
= MB2WC(NULL
, psz
, 0);
363 if ( nLen
!= wxCONV_FAILED
)
365 // now do the actual conversion
366 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
368 // +1 for the trailing NUL
369 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
374 return wxWCharBuffer();
377 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
381 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
382 if ( nLen
!= wxCONV_FAILED
)
384 // extra space for trailing NUL(s)
385 static const size_t extraLen
= GetMaxMBNulLen();
387 wxCharBuffer
buf(nLen
+ extraLen
- 1);
388 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
393 return wxCharBuffer();
397 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
399 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
400 if ( dstLen
!= wxCONV_FAILED
)
402 wxWCharBuffer
wbuf(dstLen
- 1);
403 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
408 if ( wbuf
[dstLen
- 1] == L
'\0' )
419 return wxWCharBuffer();
423 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
425 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
426 if ( dstLen
!= wxCONV_FAILED
)
430 // special case: can't allocate 0 size buffer below
434 wxCharBuffer
buf(dstLen
- 1);
435 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
441 const size_t nulLen
= GetMBNulLen();
442 if ( dstLen
>= nulLen
&&
443 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
445 // in this case the output is NUL-terminated and we're not
446 // supposed to count NUL
458 return wxCharBuffer();
461 // ----------------------------------------------------------------------------
463 // ----------------------------------------------------------------------------
465 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
467 return wxMB2WC(buf
, psz
, n
);
470 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
472 return wxWC2MB(buf
, psz
, n
);
475 // ----------------------------------------------------------------------------
476 // wxConvBrokenFileNames
477 // ----------------------------------------------------------------------------
481 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar
*charset
)
483 if ( !charset
|| wxStricmp(charset
, _T("UTF-8")) == 0
484 || wxStricmp(charset
, _T("UTF8")) == 0 )
485 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
);
487 m_conv
= new wxCSConv(charset
);
492 // ----------------------------------------------------------------------------
494 // ----------------------------------------------------------------------------
496 // Implementation (C) 2004 Fredrik Roubert
499 // BASE64 decoding table
501 static const unsigned char utf7unb64
[] =
503 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
504 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
505 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
506 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
508 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
509 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
510 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
512 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
513 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
514 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
516 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
517 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
518 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
529 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
530 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
532 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
533 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
534 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
537 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
541 while ( *psz
&& (!buf
|| (len
< n
)) )
543 unsigned char cc
= *psz
++;
551 else if (*psz
== '-')
559 else // start of BASE64 encoded string
563 for ( ok
= lsb
= false, d
= 0, l
= 0;
564 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
569 for (l
+= 6; l
>= 8; lsb
= !lsb
)
571 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
581 *buf
= (wchar_t)(c
<< 8);
590 // in valid UTF7 we should have valid characters after '+'
591 return wxCONV_FAILED
;
599 if ( buf
&& (len
< n
) )
606 // BASE64 encoding table
608 static const unsigned char utf7enb64
[] =
610 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
611 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
612 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
613 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
614 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
615 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
616 'w', 'x', 'y', 'z', '0', '1', '2', '3',
617 '4', '5', '6', '7', '8', '9', '+', '/'
621 // UTF-7 encoding table
623 // 0 - Set D (directly encoded characters)
624 // 1 - Set O (optional direct characters)
625 // 2 - whitespace characters (optional)
626 // 3 - special characters
628 static const unsigned char utf7encode
[128] =
630 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
631 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
632 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
633 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
634 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
635 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
636 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
637 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
640 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
644 while (*psz
&& ((!buf
) || (len
< n
)))
647 if (cc
< 0x80 && utf7encode
[cc
] < 1)
656 else if (((wxUint32
)cc
) > 0xffff)
658 // no surrogate pair generation (yet?)
659 return wxCONV_FAILED
;
670 // BASE64 encode string
671 unsigned int lsb
, d
, l
;
672 for (d
= 0, l
= 0; /*nothing*/; psz
++)
674 for (lsb
= 0; lsb
< 2; lsb
++)
677 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
679 for (l
+= 8; l
>= 6; )
683 *buf
++ = utf7enb64
[(d
>> l
) % 64];
689 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
696 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
708 if (buf
&& (len
< n
))
714 // ----------------------------------------------------------------------------
716 // ----------------------------------------------------------------------------
718 static wxUint32 utf8_max
[]=
719 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
721 // boundaries of the private use area we use to (temporarily) remap
722 // characters that are invalid in a UTF-8 encoded string
723 const wxUint32 wxUnicodePUA
= 0x100000;
724 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
726 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
730 while (*psz
&& ((!buf
) || (len
< n
)))
732 const char *opsz
= psz
;
733 bool invalid
= false;
734 unsigned char cc
= *psz
++, fc
= cc
;
736 for (cnt
= 0; fc
& 0x80; cnt
++)
746 // escape the escape character for octal escapes
747 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
748 && cc
== '\\' && (!buf
|| len
< n
))
760 // invalid UTF-8 sequence
765 unsigned ocnt
= cnt
- 1;
766 wxUint32 res
= cc
& (0x3f >> cnt
);
770 if ((cc
& 0xC0) != 0x80)
772 // invalid UTF-8 sequence
778 res
= (res
<< 6) | (cc
& 0x3f);
781 if (invalid
|| res
<= utf8_max
[ocnt
])
783 // illegal UTF-8 encoding
786 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
787 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
789 // if one of our PUA characters turns up externally
790 // it must also be treated as an illegal sequence
791 // (a bit like you have to escape an escape character)
797 // cast is ok because wchar_t == wxUint16 if WC_UTF16
798 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
799 if (pa
== wxCONV_FAILED
)
811 *buf
++ = (wchar_t)res
;
813 #endif // WC_UTF16/!WC_UTF16
819 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
821 while (opsz
< psz
&& (!buf
|| len
< n
))
824 // cast is ok because wchar_t == wxUint16 if WC_UTF16
825 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
826 wxASSERT(pa
!= wxCONV_FAILED
);
833 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
839 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
841 while (opsz
< psz
&& (!buf
|| len
< n
))
843 if ( buf
&& len
+ 3 < n
)
845 unsigned char on
= *opsz
;
847 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
848 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
849 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
856 else // MAP_INVALID_UTF8_NOT
858 return wxCONV_FAILED
;
864 if (buf
&& (len
< n
))
870 static inline bool isoctal(wchar_t wch
)
872 return L
'0' <= wch
&& wch
<= L
'7';
875 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
879 while (*psz
&& ((!buf
) || (len
< n
)))
884 // cast is ok for WC_UTF16
885 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
886 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
888 cc
= (*psz
++) & 0x7fffffff;
891 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
892 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
895 *buf
++ = (char)(cc
- wxUnicodePUA
);
898 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
899 && cc
== L
'\\' && psz
[0] == L
'\\' )
906 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
908 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
912 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
913 (psz
[1] - L
'0') * 010 +
923 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
939 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
941 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
947 if (buf
&& (len
< n
))
953 // ============================================================================
955 // ============================================================================
957 #ifdef WORDS_BIGENDIAN
958 #define wxMBConvUTF16straight wxMBConvUTF16BE
959 #define wxMBConvUTF16swap wxMBConvUTF16LE
961 #define wxMBConvUTF16swap wxMBConvUTF16BE
962 #define wxMBConvUTF16straight wxMBConvUTF16LE
966 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
968 if ( srcLen
== wxNO_LEN
)
970 // count the number of bytes in input, including the trailing NULs
971 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
972 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
975 srcLen
*= BYTES_PER_CHAR
;
977 else // we already have the length
979 // we can only convert an entire number of UTF-16 characters
980 if ( srcLen
% BYTES_PER_CHAR
)
981 return wxCONV_FAILED
;
987 // case when in-memory representation is UTF-16 too
990 // ----------------------------------------------------------------------------
991 // conversions without endianness change
992 // ----------------------------------------------------------------------------
995 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
996 const char *src
, size_t srcLen
) const
998 // set up the scene for using memcpy() (which is presumably more efficient
999 // than copying the bytes one by one)
1000 srcLen
= GetLength(src
, srcLen
);
1001 if ( srcLen
== wxNO_LEN
)
1002 return wxCONV_FAILED
;
1004 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1007 if ( dstLen
< inLen
)
1008 return wxCONV_FAILED
;
1010 memcpy(dst
, src
, srcLen
);
1017 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1018 const wchar_t *src
, size_t srcLen
) const
1020 if ( srcLen
== wxNO_LEN
)
1021 srcLen
= wxWcslen(src
) + 1;
1023 srcLen
*= BYTES_PER_CHAR
;
1027 if ( dstLen
< srcLen
)
1028 return wxCONV_FAILED
;
1030 memcpy(dst
, src
, srcLen
);
1036 // ----------------------------------------------------------------------------
1037 // endian-reversing conversions
1038 // ----------------------------------------------------------------------------
1041 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1042 const char *src
, size_t srcLen
) const
1044 srcLen
= GetLength(src
, srcLen
);
1045 if ( srcLen
== wxNO_LEN
)
1046 return wxCONV_FAILED
;
1048 srcLen
/= BYTES_PER_CHAR
;
1052 if ( dstLen
< srcLen
)
1053 return wxCONV_FAILED
;
1055 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1056 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1058 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1066 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1067 const wchar_t *src
, size_t srcLen
) const
1069 if ( srcLen
== wxNO_LEN
)
1070 srcLen
= wxWcslen(src
) + 1;
1072 srcLen
*= BYTES_PER_CHAR
;
1076 if ( dstLen
< srcLen
)
1077 return wxCONV_FAILED
;
1079 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1080 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1082 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1089 #else // !WC_UTF16: wchar_t is UTF-32
1091 // ----------------------------------------------------------------------------
1092 // conversions without endianness change
1093 // ----------------------------------------------------------------------------
1096 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1097 const char *src
, size_t srcLen
) const
1099 srcLen
= GetLength(src
, srcLen
);
1100 if ( srcLen
== wxNO_LEN
)
1101 return wxCONV_FAILED
;
1103 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1106 // optimization: return maximal space which could be needed for this
1107 // string even if the real size could be smaller if the buffer contains
1113 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1114 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1116 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1118 return wxCONV_FAILED
;
1120 if ( ++outLen
> dstLen
)
1121 return wxCONV_FAILED
;
1131 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1132 const wchar_t *src
, size_t srcLen
) const
1134 if ( srcLen
== wxNO_LEN
)
1135 srcLen
= wxWcslen(src
) + 1;
1138 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1139 for ( size_t n
= 0; n
< srcLen
; n
++ )
1142 const size_t numChars
= encode_utf16(*src
++, cc
);
1143 if ( numChars
== wxCONV_FAILED
)
1144 return wxCONV_FAILED
;
1146 outLen
+= numChars
* BYTES_PER_CHAR
;
1149 if ( outLen
> dstLen
)
1150 return wxCONV_FAILED
;
1153 if ( numChars
== 2 )
1155 // second character of a surrogate
1164 // ----------------------------------------------------------------------------
1165 // endian-reversing conversions
1166 // ----------------------------------------------------------------------------
1169 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1170 const char *src
, size_t srcLen
) const
1172 srcLen
= GetLength(src
, srcLen
);
1173 if ( srcLen
== wxNO_LEN
)
1174 return wxCONV_FAILED
;
1176 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1179 // optimization: return maximal space which could be needed for this
1180 // string even if the real size could be smaller if the buffer contains
1186 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1187 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1192 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1194 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1196 const size_t numChars
= decode_utf16(tmp
, ch
);
1197 if ( numChars
== wxCONV_FAILED
)
1198 return wxCONV_FAILED
;
1200 if ( numChars
== 2 )
1203 if ( ++outLen
> dstLen
)
1204 return wxCONV_FAILED
;
1214 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1215 const wchar_t *src
, size_t srcLen
) const
1217 if ( srcLen
== wxNO_LEN
)
1218 srcLen
= wxWcslen(src
) + 1;
1221 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1222 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1225 const size_t numChars
= encode_utf16(*src
, cc
);
1226 if ( numChars
== wxCONV_FAILED
)
1227 return wxCONV_FAILED
;
1229 outLen
+= numChars
* BYTES_PER_CHAR
;
1232 if ( outLen
> dstLen
)
1233 return wxCONV_FAILED
;
1235 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1236 if ( numChars
== 2 )
1238 // second character of a surrogate
1239 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1247 #endif // WC_UTF16/!WC_UTF16
1250 // ============================================================================
1252 // ============================================================================
1254 #ifdef WORDS_BIGENDIAN
1255 #define wxMBConvUTF32straight wxMBConvUTF32BE
1256 #define wxMBConvUTF32swap wxMBConvUTF32LE
1258 #define wxMBConvUTF32swap wxMBConvUTF32BE
1259 #define wxMBConvUTF32straight wxMBConvUTF32LE
1263 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1264 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1267 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1269 if ( srcLen
== wxNO_LEN
)
1271 // count the number of bytes in input, including the trailing NULs
1272 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1273 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1276 srcLen
*= BYTES_PER_CHAR
;
1278 else // we already have the length
1280 // we can only convert an entire number of UTF-32 characters
1281 if ( srcLen
% BYTES_PER_CHAR
)
1282 return wxCONV_FAILED
;
1288 // case when in-memory representation is UTF-16
1291 // ----------------------------------------------------------------------------
1292 // conversions without endianness change
1293 // ----------------------------------------------------------------------------
1296 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1297 const char *src
, size_t srcLen
) const
1299 srcLen
= GetLength(src
, srcLen
);
1300 if ( srcLen
== wxNO_LEN
)
1301 return wxCONV_FAILED
;
1303 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1304 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1306 for ( size_t n
= 0; n
< inLen
; n
++ )
1309 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1310 if ( numChars
== wxCONV_FAILED
)
1311 return wxCONV_FAILED
;
1316 if ( outLen
> dstLen
)
1317 return wxCONV_FAILED
;
1320 if ( numChars
== 2 )
1322 // second character of a surrogate
1332 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1333 const wchar_t *src
, size_t srcLen
) const
1335 if ( srcLen
== wxNO_LEN
)
1336 srcLen
= wxWcslen(src
) + 1;
1340 // optimization: return maximal space which could be needed for this
1341 // string instead of the exact amount which could be less if there are
1342 // any surrogates in the input
1344 // we consider that surrogates are rare enough to make it worthwhile to
1345 // avoid running the loop below at the cost of slightly extra memory
1347 return srcLen
* BYTES_PER_CHAR
;
1350 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1352 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1354 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1356 return wxCONV_FAILED
;
1358 outLen
+= BYTES_PER_CHAR
;
1360 if ( outLen
> dstLen
)
1361 return wxCONV_FAILED
;
1369 // ----------------------------------------------------------------------------
1370 // endian-reversing conversions
1371 // ----------------------------------------------------------------------------
1374 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1375 const char *src
, size_t srcLen
) const
1377 srcLen
= GetLength(src
, srcLen
);
1378 if ( srcLen
== wxNO_LEN
)
1379 return wxCONV_FAILED
;
1381 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1382 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1384 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1387 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1388 if ( numChars
== wxCONV_FAILED
)
1389 return wxCONV_FAILED
;
1394 if ( outLen
> dstLen
)
1395 return wxCONV_FAILED
;
1398 if ( numChars
== 2 )
1400 // second character of a surrogate
1410 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1411 const wchar_t *src
, size_t srcLen
) const
1413 if ( srcLen
== wxNO_LEN
)
1414 srcLen
= wxWcslen(src
) + 1;
1418 // optimization: return maximal space which could be needed for this
1419 // string instead of the exact amount which could be less if there are
1420 // any surrogates in the input
1422 // we consider that surrogates are rare enough to make it worthwhile to
1423 // avoid running the loop below at the cost of slightly extra memory
1425 return srcLen
*BYTES_PER_CHAR
;
1428 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1430 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1432 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1434 return wxCONV_FAILED
;
1436 outLen
+= BYTES_PER_CHAR
;
1438 if ( outLen
> dstLen
)
1439 return wxCONV_FAILED
;
1441 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1447 #else // !WC_UTF16: wchar_t is UTF-32
1449 // ----------------------------------------------------------------------------
1450 // conversions without endianness change
1451 // ----------------------------------------------------------------------------
1454 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1455 const char *src
, size_t srcLen
) const
1457 // use memcpy() as it should be much faster than hand-written loop
1458 srcLen
= GetLength(src
, srcLen
);
1459 if ( srcLen
== wxNO_LEN
)
1460 return wxCONV_FAILED
;
1462 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1465 if ( dstLen
< inLen
)
1466 return wxCONV_FAILED
;
1468 memcpy(dst
, src
, srcLen
);
1475 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1476 const wchar_t *src
, size_t srcLen
) const
1478 if ( srcLen
== wxNO_LEN
)
1479 srcLen
= wxWcslen(src
) + 1;
1481 srcLen
*= BYTES_PER_CHAR
;
1485 if ( dstLen
< srcLen
)
1486 return wxCONV_FAILED
;
1488 memcpy(dst
, src
, srcLen
);
1494 // ----------------------------------------------------------------------------
1495 // endian-reversing conversions
1496 // ----------------------------------------------------------------------------
1499 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1500 const char *src
, size_t srcLen
) const
1502 srcLen
= GetLength(src
, srcLen
);
1503 if ( srcLen
== wxNO_LEN
)
1504 return wxCONV_FAILED
;
1506 srcLen
/= BYTES_PER_CHAR
;
1510 if ( dstLen
< srcLen
)
1511 return wxCONV_FAILED
;
1513 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1514 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1516 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1524 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1525 const wchar_t *src
, size_t srcLen
) const
1527 if ( srcLen
== wxNO_LEN
)
1528 srcLen
= wxWcslen(src
) + 1;
1530 srcLen
*= BYTES_PER_CHAR
;
1534 if ( dstLen
< srcLen
)
1535 return wxCONV_FAILED
;
1537 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1538 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1540 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1547 #endif // WC_UTF16/!WC_UTF16
1550 // ============================================================================
1551 // The classes doing conversion using the iconv_xxx() functions
1552 // ============================================================================
1556 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1557 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1558 // (unless there's yet another bug in glibc) the only case when iconv()
1559 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1560 // left in the input buffer -- when _real_ error occurs,
1561 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1563 // [This bug does not appear in glibc 2.2.]
1564 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1565 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1566 (errno != E2BIG || bufLeft != 0))
1568 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1571 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1573 #define ICONV_T_INVALID ((iconv_t)-1)
1575 #if SIZEOF_WCHAR_T == 4
1576 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1577 #define WC_ENC wxFONTENCODING_UTF32
1578 #elif SIZEOF_WCHAR_T == 2
1579 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1580 #define WC_ENC wxFONTENCODING_UTF16
1581 #else // sizeof(wchar_t) != 2 nor 4
1582 // does this ever happen?
1583 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1586 // ----------------------------------------------------------------------------
1587 // wxMBConv_iconv: encapsulates an iconv character set
1588 // ----------------------------------------------------------------------------
1590 class wxMBConv_iconv
: public wxMBConv
1593 wxMBConv_iconv(const wxChar
*name
);
1594 virtual ~wxMBConv_iconv();
1596 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1597 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1599 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1600 virtual size_t GetMBNulLen() const;
1602 virtual wxMBConv
*Clone() const
1604 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
);
1605 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1610 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1613 // the iconv handlers used to translate from multibyte
1614 // to wide char and in the other direction
1619 // guards access to m2w and w2m objects
1620 wxMutex m_iconvMutex
;
1624 // the name (for iconv_open()) of a wide char charset -- if none is
1625 // available on this machine, it will remain empty
1626 static wxString ms_wcCharsetName
;
1628 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1629 // different endian-ness than the native one
1630 static bool ms_wcNeedsSwap
;
1633 // name of the encoding handled by this conversion
1636 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1638 size_t m_minMBCharWidth
;
1641 // make the constructor available for unit testing
1642 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const wxChar
* name
)
1644 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1645 if ( !result
->IsOk() )
1654 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1655 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1657 wxMBConv_iconv::wxMBConv_iconv(const wxChar
*name
)
1660 m_minMBCharWidth
= 0;
1662 // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1663 // names for the charsets
1664 const wxCharBuffer
cname(wxString(name
).ToAscii());
1666 // check for charset that represents wchar_t:
1667 if ( ms_wcCharsetName
.empty() )
1669 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1672 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1673 #else // !wxUSE_FONTMAP
1674 static const wxChar
*names
[] =
1676 #if SIZEOF_WCHAR_T == 4
1678 #elif SIZEOF_WCHAR_T = 2
1683 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1685 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1687 const wxString
nameCS(*names
);
1689 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1690 wxString
nameXE(nameCS
);
1692 #ifdef WORDS_BIGENDIAN
1694 #else // little endian
1698 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1701 m2w
= iconv_open(nameXE
.ToAscii(), cname
);
1702 if ( m2w
== ICONV_T_INVALID
)
1704 // try charset w/o bytesex info (e.g. "UCS4")
1705 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1707 m2w
= iconv_open(nameCS
.ToAscii(), cname
);
1709 // and check for bytesex ourselves:
1710 if ( m2w
!= ICONV_T_INVALID
)
1712 char buf
[2], *bufPtr
;
1713 wchar_t wbuf
[2], *wbufPtr
;
1721 outsz
= SIZEOF_WCHAR_T
* 2;
1726 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1727 (char**)&wbufPtr
, &outsz
);
1729 if (ICONV_FAILED(res
, insz
))
1731 wxLogLastError(wxT("iconv"));
1732 wxLogError(_("Conversion to charset '%s' doesn't work."),
1735 else // ok, can convert to this encoding, remember it
1737 ms_wcCharsetName
= nameCS
;
1738 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1742 else // use charset not requiring byte swapping
1744 ms_wcCharsetName
= nameXE
;
1748 wxLogTrace(TRACE_STRCONV
,
1749 wxT("iconv wchar_t charset is \"%s\"%s"),
1750 ms_wcCharsetName
.empty() ? _T("<none>")
1751 : ms_wcCharsetName
.c_str(),
1752 ms_wcNeedsSwap
? _T(" (needs swap)")
1755 else // we already have ms_wcCharsetName
1757 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), cname
);
1760 if ( ms_wcCharsetName
.empty() )
1762 w2m
= ICONV_T_INVALID
;
1766 w2m
= iconv_open(cname
, ms_wcCharsetName
.ToAscii());
1767 if ( w2m
== ICONV_T_INVALID
)
1769 wxLogTrace(TRACE_STRCONV
,
1770 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1771 ms_wcCharsetName
.c_str(), cname
.data());
1776 wxMBConv_iconv::~wxMBConv_iconv()
1778 if ( m2w
!= ICONV_T_INVALID
)
1780 if ( w2m
!= ICONV_T_INVALID
)
1784 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
1786 // find the string length: notice that this must be done differently for
1787 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1789 const size_t nulLen
= GetMBNulLen();
1793 return wxCONV_FAILED
;
1796 inbuf
= strlen(psz
); // arguably more optimized than our version
1801 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1802 // they also have to start at character boundary and not span two
1803 // adjacent characters
1805 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
1812 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1813 // Unfortunately there are a couple of global wxCSConv objects such as
1814 // wxConvLocal that are used all over wx code, so we have to make sure
1815 // the handle is used by at most one thread at a time. Otherwise
1816 // only a few wx classes would be safe to use from non-main threads
1817 // as MB<->WC conversion would fail "randomly".
1818 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1819 #endif // wxUSE_THREADS
1821 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
1823 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1824 wchar_t *bufPtr
= buf
;
1825 const char *pszPtr
= psz
;
1829 // have destination buffer, convert there
1831 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1832 (char**)&bufPtr
, &outbuf
);
1833 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
1837 // convert to native endianness
1838 for ( unsigned i
= 0; i
< res
; i
++ )
1839 buf
[n
] = WC_BSWAP(buf
[i
]);
1842 // NUL-terminate the string if there is any space left
1848 // no destination buffer... convert using temp buffer
1849 // to calculate destination buffer requirement
1856 outbuf
= 8 * SIZEOF_WCHAR_T
;
1859 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
1860 (char**)&bufPtr
, &outbuf
);
1862 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
1864 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1867 if (ICONV_FAILED(cres
, inbuf
))
1869 //VS: it is ok if iconv fails, hence trace only
1870 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1871 return wxCONV_FAILED
;
1877 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1880 // NB: explained in MB2WC
1881 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
1884 size_t inlen
= wxWcslen(psz
);
1885 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
1889 wchar_t *tmpbuf
= 0;
1893 // need to copy to temp buffer to switch endianness
1894 // (doing WC_BSWAP twice on the original buffer won't help, as it
1895 // could be in read-only memory, or be accessed in some other thread)
1896 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
1897 for ( size_t i
= 0; i
< inlen
; i
++ )
1898 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
1900 tmpbuf
[inlen
] = L
'\0';
1906 // have destination buffer, convert there
1907 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1911 // NB: iconv was given only wcslen(psz) characters on input, and so
1912 // it couldn't convert the trailing zero. Let's do it ourselves
1913 // if there's some room left for it in the output buffer.
1919 // no destination buffer: convert using temp buffer
1920 // to calculate destination buffer requirement
1928 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
1932 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
1940 if (ICONV_FAILED(cres
, inbuf
))
1942 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1943 return wxCONV_FAILED
;
1949 size_t wxMBConv_iconv::GetMBNulLen() const
1951 if ( m_minMBCharWidth
== 0 )
1953 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
1956 // NB: explained in MB2WC
1957 wxMutexLocker
lock(self
->m_iconvMutex
);
1960 wchar_t *wnul
= L
"";
1961 char buf
[8]; // should be enough for NUL in any encoding
1962 size_t inLen
= sizeof(wchar_t),
1963 outLen
= WXSIZEOF(buf
);
1964 char *inBuff
= (char *)wnul
;
1965 char *outBuff
= buf
;
1966 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
1968 self
->m_minMBCharWidth
= (size_t)-1;
1972 self
->m_minMBCharWidth
= outBuff
- buf
;
1976 return m_minMBCharWidth
;
1979 #endif // HAVE_ICONV
1982 // ============================================================================
1983 // Win32 conversion classes
1984 // ============================================================================
1986 #ifdef wxHAVE_WIN32_MB2WC
1990 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const wxChar
*charset
);
1991 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
1994 class wxMBConv_win32
: public wxMBConv
1999 m_CodePage
= CP_ACP
;
2000 m_minMBCharWidth
= 0;
2003 wxMBConv_win32(const wxMBConv_win32
& conv
)
2005 m_CodePage
= conv
.m_CodePage
;
2006 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2010 wxMBConv_win32(const wxChar
* name
)
2012 m_CodePage
= wxCharsetToCodepage(name
);
2013 m_minMBCharWidth
= 0;
2016 wxMBConv_win32(wxFontEncoding encoding
)
2018 m_CodePage
= wxEncodingToCodepage(encoding
);
2019 m_minMBCharWidth
= 0;
2021 #endif // wxUSE_FONTMAP
2023 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2025 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2026 // the behaviour is not compatible with the Unix version (using iconv)
2027 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2028 // wouldn't work if reading an incomplete MB char didn't result in an error
2031 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2032 // Win XP or newer and it is not supported for UTF-[78] so we always
2033 // use our own conversions in this case. See
2034 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2035 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2036 if ( m_CodePage
== CP_UTF8
)
2038 return wxConvUTF8
.MB2WC(buf
, psz
, n
);
2041 if ( m_CodePage
== CP_UTF7
)
2043 return wxConvUTF7
.MB2WC(buf
, psz
, n
);
2047 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2048 IsAtLeastWin2kSP4() )
2050 flags
= MB_ERR_INVALID_CHARS
;
2053 const size_t len
= ::MultiByteToWideChar
2055 m_CodePage
, // code page
2056 flags
, // flags: fall on error
2057 psz
, // input string
2058 -1, // its length (NUL-terminated)
2059 buf
, // output string
2060 buf
? n
: 0 // size of output buffer
2064 // function totally failed
2065 return wxCONV_FAILED
;
2068 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2069 // check if we succeeded, by doing a double trip:
2070 if ( !flags
&& buf
)
2072 const size_t mbLen
= strlen(psz
);
2073 wxCharBuffer
mbBuf(mbLen
);
2074 if ( ::WideCharToMultiByte
2081 mbLen
+ 1, // size in bytes, not length
2085 strcmp(mbBuf
, psz
) != 0 )
2087 // we didn't obtain the same thing we started from, hence
2088 // the conversion was lossy and we consider that it failed
2089 return wxCONV_FAILED
;
2093 // note that it returns count of written chars for buf != NULL and size
2094 // of the needed buffer for buf == NULL so in either case the length of
2095 // the string (which never includes the terminating NUL) is one less
2099 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2102 we have a problem here: by default, WideCharToMultiByte() may
2103 replace characters unrepresentable in the target code page with bad
2104 quality approximations such as turning "1/2" symbol (U+00BD) into
2105 "1" for the code pages which don't have it and we, obviously, want
2106 to avoid this at any price
2108 the trouble is that this function does it _silently_, i.e. it won't
2109 even tell us whether it did or not... Win98/2000 and higher provide
2110 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2111 we have to resort to a round trip, i.e. check that converting back
2112 results in the same string -- this is, of course, expensive but
2113 otherwise we simply can't be sure to not garble the data.
2116 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2117 // it doesn't work with CJK encodings (which we test for rather roughly
2118 // here...) nor with UTF-7/8 nor, of course, with Windows versions not supporting it
2120 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2123 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2125 // it's our lucky day
2126 flags
= WC_NO_BEST_FIT_CHARS
;
2127 pUsedDef
= &usedDef
;
2129 else // old system or unsupported encoding
2135 const size_t len
= ::WideCharToMultiByte
2137 m_CodePage
, // code page
2138 flags
, // either none or no best fit
2139 pwz
, // input string
2140 -1, // it is (wide) NUL-terminated
2141 buf
, // output buffer
2142 buf
? n
: 0, // and its size
2143 NULL
, // default "replacement" char
2144 pUsedDef
// [out] was it used?
2149 // function totally failed
2150 return wxCONV_FAILED
;
2153 // if we were really converting, check if we succeeded
2158 // check if the conversion failed, i.e. if any replacements were done
2161 return wxCONV_FAILED
;
2163 else // we must resort to double tripping...
2165 wxWCharBuffer
wcBuf(n
);
2166 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2167 wcscmp(wcBuf
, pwz
) != 0 )
2169 // we didn't obtain the same thing we started from, hence
2170 // the conversion was lossy and we consider that it failed
2171 return wxCONV_FAILED
;
2176 // see the comment above for the reason of "len - 1"
2180 virtual size_t GetMBNulLen() const
2182 if ( m_minMBCharWidth
== 0 )
2184 int len
= ::WideCharToMultiByte
2186 m_CodePage
, // code page
2188 L
"", // input string
2189 1, // translate just the NUL
2190 NULL
, // output buffer
2192 NULL
, // no replacement char
2193 NULL
// [out] don't care if it was used
2196 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2200 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2201 self
->m_minMBCharWidth
= (size_t)-1;
2205 self
->m_minMBCharWidth
= (size_t)-1;
2211 self
->m_minMBCharWidth
= len
;
2216 return m_minMBCharWidth
;
2219 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2221 bool IsOk() const { return m_CodePage
!= -1; }
2224 static bool CanUseNoBestFit()
2226 static int s_isWin98Or2k
= -1;
2228 if ( s_isWin98Or2k
== -1 )
2231 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2234 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2238 s_isWin98Or2k
= verMaj
>= 5;
2242 // unknown: be conservative by default
2247 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2250 return s_isWin98Or2k
== 1;
2253 static bool IsAtLeastWin2kSP4()
2258 static int s_isAtLeastWin2kSP4
= -1;
2260 if ( s_isAtLeastWin2kSP4
== -1 )
2262 OSVERSIONINFOEX ver
;
2264 memset(&ver
, 0, sizeof(ver
));
2265 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2266 GetVersionEx((OSVERSIONINFO
*)&ver
);
2268 s_isAtLeastWin2kSP4
=
2269 ((ver
.dwMajorVersion
> 5) || // Vista+
2270 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2271 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2272 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2276 return s_isAtLeastWin2kSP4
== 1;
2281 // the code page we're working with
2284 // cached result of GetMBNulLen(), set to 0 initially meaning "unknown"
2286 size_t m_minMBCharWidth
;
2289 #endif // wxHAVE_WIN32_MB2WC
2291 // ============================================================================
2292 // Cocoa conversion classes
2293 // ============================================================================
2295 #if defined(__WXCOCOA__)
2297 // RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2298 // Strangely enough, Core Foundation uses UTF-32 internally
2299 // quite a bit - it's just not public (yet).
2301 #include <CoreFoundation/CFString.h>
2302 #include <CoreFoundation/CFStringEncodingExt.h>
2304 CFStringEncoding
wxCFStringEncFromFontEnc(wxFontEncoding encoding
)
2306 CFStringEncoding enc
= kCFStringEncodingInvalidId
;
2310 case wxFONTENCODING_DEFAULT
:
2311 enc
= CFStringGetSystemEncoding();
2314 case wxFONTENCODING_ISO8859_1
:
2315 enc
= kCFStringEncodingISOLatin1
;
2317 case wxFONTENCODING_ISO8859_2
:
2318 enc
= kCFStringEncodingISOLatin2
;
2320 case wxFONTENCODING_ISO8859_3
:
2321 enc
= kCFStringEncodingISOLatin3
;
2323 case wxFONTENCODING_ISO8859_4
:
2324 enc
= kCFStringEncodingISOLatin4
;
2326 case wxFONTENCODING_ISO8859_5
:
2327 enc
= kCFStringEncodingISOLatinCyrillic
;
2329 case wxFONTENCODING_ISO8859_6
:
2330 enc
= kCFStringEncodingISOLatinArabic
;
2332 case wxFONTENCODING_ISO8859_7
:
2333 enc
= kCFStringEncodingISOLatinGreek
;
2335 case wxFONTENCODING_ISO8859_8
:
2336 enc
= kCFStringEncodingISOLatinHebrew
;
2338 case wxFONTENCODING_ISO8859_9
:
2339 enc
= kCFStringEncodingISOLatin5
;
2341 case wxFONTENCODING_ISO8859_10
:
2342 enc
= kCFStringEncodingISOLatin6
;
2344 case wxFONTENCODING_ISO8859_11
:
2345 enc
= kCFStringEncodingISOLatinThai
;
2347 case wxFONTENCODING_ISO8859_13
:
2348 enc
= kCFStringEncodingISOLatin7
;
2350 case wxFONTENCODING_ISO8859_14
:
2351 enc
= kCFStringEncodingISOLatin8
;
2353 case wxFONTENCODING_ISO8859_15
:
2354 enc
= kCFStringEncodingISOLatin9
;
2357 case wxFONTENCODING_KOI8
:
2358 enc
= kCFStringEncodingKOI8_R
;
2360 case wxFONTENCODING_ALTERNATIVE
: // MS-DOS CP866
2361 enc
= kCFStringEncodingDOSRussian
;
2364 // case wxFONTENCODING_BULGARIAN :
2368 case wxFONTENCODING_CP437
:
2369 enc
= kCFStringEncodingDOSLatinUS
;
2371 case wxFONTENCODING_CP850
:
2372 enc
= kCFStringEncodingDOSLatin1
;
2374 case wxFONTENCODING_CP852
:
2375 enc
= kCFStringEncodingDOSLatin2
;
2377 case wxFONTENCODING_CP855
:
2378 enc
= kCFStringEncodingDOSCyrillic
;
2380 case wxFONTENCODING_CP866
:
2381 enc
= kCFStringEncodingDOSRussian
;
2383 case wxFONTENCODING_CP874
:
2384 enc
= kCFStringEncodingDOSThai
;
2386 case wxFONTENCODING_CP932
:
2387 enc
= kCFStringEncodingDOSJapanese
;
2389 case wxFONTENCODING_CP936
:
2390 enc
= kCFStringEncodingDOSChineseSimplif
;
2392 case wxFONTENCODING_CP949
:
2393 enc
= kCFStringEncodingDOSKorean
;
2395 case wxFONTENCODING_CP950
:
2396 enc
= kCFStringEncodingDOSChineseTrad
;
2398 case wxFONTENCODING_CP1250
:
2399 enc
= kCFStringEncodingWindowsLatin2
;
2401 case wxFONTENCODING_CP1251
:
2402 enc
= kCFStringEncodingWindowsCyrillic
;
2404 case wxFONTENCODING_CP1252
:
2405 enc
= kCFStringEncodingWindowsLatin1
;
2407 case wxFONTENCODING_CP1253
:
2408 enc
= kCFStringEncodingWindowsGreek
;
2410 case wxFONTENCODING_CP1254
:
2411 enc
= kCFStringEncodingWindowsLatin5
;
2413 case wxFONTENCODING_CP1255
:
2414 enc
= kCFStringEncodingWindowsHebrew
;
2416 case wxFONTENCODING_CP1256
:
2417 enc
= kCFStringEncodingWindowsArabic
;
2419 case wxFONTENCODING_CP1257
:
2420 enc
= kCFStringEncodingWindowsBalticRim
;
2422 // This only really encodes to UTF7 (if that) evidently
2423 // case wxFONTENCODING_UTF7 :
2424 // enc = kCFStringEncodingNonLossyASCII ;
2426 case wxFONTENCODING_UTF8
:
2427 enc
= kCFStringEncodingUTF8
;
2429 case wxFONTENCODING_EUC_JP
:
2430 enc
= kCFStringEncodingEUC_JP
;
2432 case wxFONTENCODING_UTF16
:
2433 enc
= kCFStringEncodingUnicode
;
2435 case wxFONTENCODING_MACROMAN
:
2436 enc
= kCFStringEncodingMacRoman
;
2438 case wxFONTENCODING_MACJAPANESE
:
2439 enc
= kCFStringEncodingMacJapanese
;
2441 case wxFONTENCODING_MACCHINESETRAD
:
2442 enc
= kCFStringEncodingMacChineseTrad
;
2444 case wxFONTENCODING_MACKOREAN
:
2445 enc
= kCFStringEncodingMacKorean
;
2447 case wxFONTENCODING_MACARABIC
:
2448 enc
= kCFStringEncodingMacArabic
;
2450 case wxFONTENCODING_MACHEBREW
:
2451 enc
= kCFStringEncodingMacHebrew
;
2453 case wxFONTENCODING_MACGREEK
:
2454 enc
= kCFStringEncodingMacGreek
;
2456 case wxFONTENCODING_MACCYRILLIC
:
2457 enc
= kCFStringEncodingMacCyrillic
;
2459 case wxFONTENCODING_MACDEVANAGARI
:
2460 enc
= kCFStringEncodingMacDevanagari
;
2462 case wxFONTENCODING_MACGURMUKHI
:
2463 enc
= kCFStringEncodingMacGurmukhi
;
2465 case wxFONTENCODING_MACGUJARATI
:
2466 enc
= kCFStringEncodingMacGujarati
;
2468 case wxFONTENCODING_MACORIYA
:
2469 enc
= kCFStringEncodingMacOriya
;
2471 case wxFONTENCODING_MACBENGALI
:
2472 enc
= kCFStringEncodingMacBengali
;
2474 case wxFONTENCODING_MACTAMIL
:
2475 enc
= kCFStringEncodingMacTamil
;
2477 case wxFONTENCODING_MACTELUGU
:
2478 enc
= kCFStringEncodingMacTelugu
;
2480 case wxFONTENCODING_MACKANNADA
:
2481 enc
= kCFStringEncodingMacKannada
;
2483 case wxFONTENCODING_MACMALAJALAM
:
2484 enc
= kCFStringEncodingMacMalayalam
;
2486 case wxFONTENCODING_MACSINHALESE
:
2487 enc
= kCFStringEncodingMacSinhalese
;
2489 case wxFONTENCODING_MACBURMESE
:
2490 enc
= kCFStringEncodingMacBurmese
;
2492 case wxFONTENCODING_MACKHMER
:
2493 enc
= kCFStringEncodingMacKhmer
;
2495 case wxFONTENCODING_MACTHAI
:
2496 enc
= kCFStringEncodingMacThai
;
2498 case wxFONTENCODING_MACLAOTIAN
:
2499 enc
= kCFStringEncodingMacLaotian
;
2501 case wxFONTENCODING_MACGEORGIAN
:
2502 enc
= kCFStringEncodingMacGeorgian
;
2504 case wxFONTENCODING_MACARMENIAN
:
2505 enc
= kCFStringEncodingMacArmenian
;
2507 case wxFONTENCODING_MACCHINESESIMP
:
2508 enc
= kCFStringEncodingMacChineseSimp
;
2510 case wxFONTENCODING_MACTIBETAN
:
2511 enc
= kCFStringEncodingMacTibetan
;
2513 case wxFONTENCODING_MACMONGOLIAN
:
2514 enc
= kCFStringEncodingMacMongolian
;
2516 case wxFONTENCODING_MACETHIOPIC
:
2517 enc
= kCFStringEncodingMacEthiopic
;
2519 case wxFONTENCODING_MACCENTRALEUR
:
2520 enc
= kCFStringEncodingMacCentralEurRoman
;
2522 case wxFONTENCODING_MACVIATNAMESE
:
2523 enc
= kCFStringEncodingMacVietnamese
;
2525 case wxFONTENCODING_MACARABICEXT
:
2526 enc
= kCFStringEncodingMacExtArabic
;
2528 case wxFONTENCODING_MACSYMBOL
:
2529 enc
= kCFStringEncodingMacSymbol
;
2531 case wxFONTENCODING_MACDINGBATS
:
2532 enc
= kCFStringEncodingMacDingbats
;
2534 case wxFONTENCODING_MACTURKISH
:
2535 enc
= kCFStringEncodingMacTurkish
;
2537 case wxFONTENCODING_MACCROATIAN
:
2538 enc
= kCFStringEncodingMacCroatian
;
2540 case wxFONTENCODING_MACICELANDIC
:
2541 enc
= kCFStringEncodingMacIcelandic
;
2543 case wxFONTENCODING_MACROMANIAN
:
2544 enc
= kCFStringEncodingMacRomanian
;
2546 case wxFONTENCODING_MACCELTIC
:
2547 enc
= kCFStringEncodingMacCeltic
;
2549 case wxFONTENCODING_MACGAELIC
:
2550 enc
= kCFStringEncodingMacGaelic
;
2552 // case wxFONTENCODING_MACKEYBOARD :
2553 // enc = kCFStringEncodingMacKeyboardGlyphs ;
2557 // because gcc is picky
2564 class wxMBConv_cocoa
: public wxMBConv
2569 Init(CFStringGetSystemEncoding()) ;
2572 wxMBConv_cocoa(const wxMBConv_cocoa
& conv
)
2574 m_encoding
= conv
.m_encoding
;
2578 wxMBConv_cocoa(const wxChar
* name
)
2580 Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) ) ;
2584 wxMBConv_cocoa(wxFontEncoding encoding
)
2586 Init( wxCFStringEncFromFontEnc(encoding
) );
2593 void Init( CFStringEncoding encoding
)
2595 m_encoding
= encoding
;
2598 size_t MB2WC(wchar_t * szOut
, const char * szUnConv
, size_t nOutSize
) const
2602 CFStringRef theString
= CFStringCreateWithBytes (
2603 NULL
, //the allocator
2604 (const UInt8
*)szUnConv
,
2607 false //no BOM/external representation
2610 wxASSERT(theString
);
2612 size_t nOutLength
= CFStringGetLength(theString
);
2616 CFRelease(theString
);
2620 CFRange theRange
= { 0, nOutSize
};
2622 #if SIZEOF_WCHAR_T == 4
2623 UniChar
* szUniCharBuffer
= new UniChar
[nOutSize
];
2626 CFStringGetCharacters(theString
, theRange
, szUniCharBuffer
);
2628 CFRelease(theString
);
2630 szUniCharBuffer
[nOutLength
] = '\0';
2632 #if SIZEOF_WCHAR_T == 4
2633 wxMBConvUTF16 converter
;
2634 converter
.MB2WC( szOut
, (const char*)szUniCharBuffer
, nOutSize
);
2635 delete [] szUniCharBuffer
;
2641 size_t WC2MB(char *szOut
, const wchar_t *szUnConv
, size_t nOutSize
) const
2645 size_t nRealOutSize
;
2646 size_t nBufSize
= wxWcslen(szUnConv
);
2647 UniChar
* szUniBuffer
= (UniChar
*) szUnConv
;
2649 #if SIZEOF_WCHAR_T == 4
2650 wxMBConvUTF16 converter
;
2651 nBufSize
= converter
.WC2MB( NULL
, szUnConv
, 0 );
2652 szUniBuffer
= new UniChar
[ (nBufSize
/ sizeof(UniChar
)) + 1];
2653 converter
.WC2MB( (char*) szUniBuffer
, szUnConv
, nBufSize
+ sizeof(UniChar
));
2654 nBufSize
/= sizeof(UniChar
);
2657 CFStringRef theString
= CFStringCreateWithCharactersNoCopy(
2661 kCFAllocatorNull
//deallocator - we want to deallocate it ourselves
2664 wxASSERT(theString
);
2666 //Note that CER puts a BOM when converting to unicode
2667 //so we check and use getchars instead in that case
2668 if (m_encoding
== kCFStringEncodingUnicode
)
2671 CFStringGetCharacters(theString
, CFRangeMake(0, nOutSize
- 1), (UniChar
*) szOut
);
2673 nRealOutSize
= CFStringGetLength(theString
) + 1;
2679 CFRangeMake(0, CFStringGetLength(theString
)),
2681 0, //what to put in characters that can't be converted -
2682 //0 tells CFString to return NULL if it meets such a character
2683 false, //not an external representation
2686 (CFIndex
*) &nRealOutSize
2690 CFRelease(theString
);
2692 #if SIZEOF_WCHAR_T == 4
2693 delete[] szUniBuffer
;
2696 return nRealOutSize
- 1;
2699 virtual wxMBConv
*Clone() const { return new wxMBConv_cocoa(*this); }
2703 return m_encoding
!= kCFStringEncodingInvalidId
&&
2704 CFStringIsEncodingAvailable(m_encoding
);
2708 CFStringEncoding m_encoding
;
2711 #endif // defined(__WXCOCOA__)
2713 // ============================================================================
2714 // Mac conversion classes
2715 // ============================================================================
2717 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2719 class wxMBConv_mac
: public wxMBConv
2724 Init(CFStringGetSystemEncoding()) ;
2727 wxMBConv_mac(const wxMBConv_mac
& conv
)
2729 Init(conv
.m_char_encoding
);
2733 wxMBConv_mac(const wxChar
* name
)
2735 Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name
, false) ) );
2739 wxMBConv_mac(wxFontEncoding encoding
)
2741 Init( wxMacGetSystemEncFromFontEnc(encoding
) );
2746 OSStatus status
= noErr
;
2747 status
= TECDisposeConverter(m_MB2WC_converter
);
2748 status
= TECDisposeConverter(m_WC2MB_converter
);
2752 void Init( TextEncodingBase encoding
)
2754 OSStatus status
= noErr
;
2755 m_char_encoding
= encoding
;
2756 m_unicode_encoding
= CreateTextEncoding(kTextEncodingUnicodeDefault
, 0, kUnicode16BitFormat
) ;
2758 status
= TECCreateConverter(&m_MB2WC_converter
,
2760 m_unicode_encoding
);
2761 status
= TECCreateConverter(&m_WC2MB_converter
,
2766 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2768 OSStatus status
= noErr
;
2769 ByteCount byteOutLen
;
2770 ByteCount byteInLen
= strlen(psz
) + 1;
2771 wchar_t *tbuf
= NULL
;
2772 UniChar
* ubuf
= NULL
;
2777 // Apple specs say at least 32
2778 n
= wxMax( 32, byteInLen
) ;
2779 tbuf
= (wchar_t*) malloc( n
* SIZEOF_WCHAR_T
) ;
2782 ByteCount byteBufferLen
= n
* sizeof( UniChar
) ;
2784 #if SIZEOF_WCHAR_T == 4
2785 ubuf
= (UniChar
*) malloc( byteBufferLen
+ 2 ) ;
2787 ubuf
= (UniChar
*) (buf
? buf
: tbuf
) ;
2790 status
= TECConvertText(
2791 m_MB2WC_converter
, (ConstTextPtr
) psz
, byteInLen
, &byteInLen
,
2792 (TextPtr
) ubuf
, byteBufferLen
, &byteOutLen
);
2794 #if SIZEOF_WCHAR_T == 4
2795 // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2796 // is not properly terminated we get random characters at the end
2797 ubuf
[byteOutLen
/ sizeof( UniChar
) ] = 0 ;
2798 wxMBConvUTF16 converter
;
2799 res
= converter
.MB2WC( (buf
? buf
: tbuf
), (const char*)ubuf
, n
) ;
2802 res
= byteOutLen
/ sizeof( UniChar
) ;
2808 if ( buf
&& res
< n
)
2814 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2816 OSStatus status
= noErr
;
2817 ByteCount byteOutLen
;
2818 ByteCount byteInLen
= wxWcslen(psz
) * SIZEOF_WCHAR_T
;
2824 // Apple specs say at least 32
2825 n
= wxMax( 32, ((byteInLen
/ SIZEOF_WCHAR_T
) * 8) + SIZEOF_WCHAR_T
);
2826 tbuf
= (char*) malloc( n
) ;
2829 ByteCount byteBufferLen
= n
;
2830 UniChar
* ubuf
= NULL
;
2832 #if SIZEOF_WCHAR_T == 4
2833 wxMBConvUTF16 converter
;
2834 size_t unicharlen
= converter
.WC2MB( NULL
, psz
, 0 ) ;
2835 byteInLen
= unicharlen
;
2836 ubuf
= (UniChar
*) malloc( byteInLen
+ 2 ) ;
2837 converter
.WC2MB( (char*) ubuf
, psz
, unicharlen
+ 2 ) ;
2839 ubuf
= (UniChar
*) psz
;
2842 status
= TECConvertText(
2843 m_WC2MB_converter
, (ConstTextPtr
) ubuf
, byteInLen
, &byteInLen
,
2844 (TextPtr
) (buf
? buf
: tbuf
), byteBufferLen
, &byteOutLen
);
2846 #if SIZEOF_WCHAR_T == 4
2853 size_t res
= byteOutLen
;
2854 if ( buf
&& res
< n
)
2858 //we need to double-trip to verify it didn't insert any ? in place
2859 //of bogus characters
2860 wxWCharBuffer
wcBuf(n
);
2861 size_t pszlen
= wxWcslen(psz
);
2862 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2863 wxWcslen(wcBuf
) != pszlen
||
2864 memcmp(wcBuf
, psz
, pszlen
* sizeof(wchar_t)) != 0 )
2866 // we didn't obtain the same thing we started from, hence
2867 // the conversion was lossy and we consider that it failed
2868 return wxCONV_FAILED
;
2875 virtual wxMBConv
*Clone() const { return new wxMBConv_mac(*this); }
2878 { return m_MB2WC_converter
!= NULL
&& m_WC2MB_converter
!= NULL
; }
2881 TECObjectRef m_MB2WC_converter
;
2882 TECObjectRef m_WC2MB_converter
;
2884 TextEncodingBase m_char_encoding
;
2885 TextEncodingBase m_unicode_encoding
;
2888 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2890 // ============================================================================
2891 // wxEncodingConverter based conversion classes
2892 // ============================================================================
2896 class wxMBConv_wxwin
: public wxMBConv
2901 m_ok
= m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2902 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2906 // temporarily just use wxEncodingConverter stuff,
2907 // so that it works while a better implementation is built
2908 wxMBConv_wxwin(const wxChar
* name
)
2911 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2913 m_enc
= wxFONTENCODING_SYSTEM
;
2918 wxMBConv_wxwin(wxFontEncoding enc
)
2925 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2927 size_t inbuf
= strlen(psz
);
2930 if (!m2w
.Convert(psz
, buf
))
2931 return wxCONV_FAILED
;
2936 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2938 const size_t inbuf
= wxWcslen(psz
);
2941 if (!w2m
.Convert(psz
, buf
))
2942 return wxCONV_FAILED
;
2948 virtual size_t GetMBNulLen() const
2952 case wxFONTENCODING_UTF16BE
:
2953 case wxFONTENCODING_UTF16LE
:
2956 case wxFONTENCODING_UTF32BE
:
2957 case wxFONTENCODING_UTF32LE
:
2965 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2967 bool IsOk() const { return m_ok
; }
2970 wxFontEncoding m_enc
;
2971 wxEncodingConverter m2w
, w2m
;
2974 // were we initialized successfully?
2977 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2980 // make the constructors available for unit testing
2981 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const wxChar
* name
)
2983 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2984 if ( !result
->IsOk() )
2993 #endif // wxUSE_FONTMAP
2995 // ============================================================================
2996 // wxCSConv implementation
2997 // ============================================================================
2999 void wxCSConv::Init()
3006 wxCSConv::wxCSConv(const wxChar
*charset
)
3016 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
3018 m_encoding
= wxFONTENCODING_SYSTEM
;
// Constructs a converter from a font encoding value and stores it in
// m_encoding.
3022 wxCSConv::wxCSConv(wxFontEncoding encoding
)
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are rejected: the failure is
// reported with wxFAIL_MSG and wxFONTENCODING_SYSTEM is used instead
3024 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
3026 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3028 encoding
= wxFONTENCODING_SYSTEM
;
3033 m_encoding
= encoding
;
3036 wxCSConv::~wxCSConv()
// Copy constructor: takes over the charset name of conv through SetName()
// and copies its m_encoding value.
3041 wxCSConv::wxCSConv(const wxCSConv
& conv
)
3046 SetName(conv
.m_name
);
3047 m_encoding
= conv
.m_encoding
;
// Assignment: like the copy constructor, passes the name of conv to
// SetName() and copies its m_encoding value.
// NOTE(review): any cleanup of the current state before these two steps is
// not shown here; confirm.
3050 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
3054 SetName(conv
.m_name
);
3055 m_encoding
= conv
.m_encoding
;
3060 void wxCSConv::Clear()
// Stores a copy of charset, made with wxStrdup(), in m_name.
// NOTE(review): whether a null charset is filtered out before the copy is
// not shown here; confirm.
3069 void wxCSConv::SetName(const wxChar
*charset
)
3073 m_name
= wxStrdup(charset
);
3079 #include "wx/hashmap.h"
// Hash map type from wxFontEncoding to wxString, using wxIntegerHash and
// wxIntegerEqual for the keys.
3081 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
3082 wxEncodingNameCache
);
// File-static cache used by wxCSConv::DoCreate(): it maps an encoding to a
// charset name tried with wxMBConv_iconv, or to an empty string to record
// that the lookup failed.
3084 static wxEncodingNameCache gs_nameCache
;
// Creates the converter object that does the real work for this wxCSConv.
// The candidates visible below are, in source order: wxMBConv_iconv (by
// explicit name, by cached name, then by every known name of the encoding),
// wxMBConv_win32, wxMBConv_mac, wxMBConv_cocoa, the built-in UTF-7/8/16/32
// converters and finally wxMBConv_wxwin. If none of them is returned, an
// error is logged at the end of the function.
// NOTE(review): the checks deciding whether each candidate is returned or
// discarded are not shown here -- presumably IsOk() tests followed by
// delete; confirm.
3087 wxMBConv
*wxCSConv::DoCreate() const
3090 wxLogTrace(TRACE_STRCONV
,
3091 wxT("creating conversion for %s"),
3093 : wxFontMapperBase::GetEncodingName(m_encoding
).c_str()));
3094 #endif // wxUSE_FONTMAP
3096 // check for the special case of ASCII or ISO8859-1 charset: as we have
3097 // special knowledge of it anyhow, we don't need to create a special
3098 // conversion object
3099 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
3100 m_encoding
== wxFONTENCODING_DEFAULT
)
3102 // don't convert at all
3106 // we trust OS to do conversion better than we can so try external
3107 // conversion methods first
3109 // the full order is:
3110 // 1. OS conversion (iconv() under Unix or Win32 API)
3111 // 2. hard coded conversions for UTF
3112 // 3. wxEncodingConverter as fall back
3118 #endif // !wxUSE_FONTMAP
// work on local copies of the name and the encoding
3120 wxString
name(m_name
);
3121 wxFontEncoding
encoding(m_encoding
);
// an explicit charset name is tried with iconv first
3123 if ( !name
.empty() )
3125 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
);
3133 wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
3134 #endif // wxUSE_FONTMAP
// look up a previously cached charset name for this encoding
3138 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3139 if ( it
!= gs_nameCache
.end() )
// an empty cached name records an earlier failure for this encoding
3141 if ( it
->second
.empty() )
3144 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
);
// otherwise go through all names known for the encoding, trying each with
// iconv; a name is stored in the cache inside the loop
3151 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3153 for ( ; *names
; ++names
)
3155 wxMBConv_iconv
*conv
= new wxMBConv_iconv(*names
);
3158 gs_nameCache
[encoding
] = *names
;
3165 gs_nameCache
[encoding
] = _T(""); // cache the failure
3167 #endif // wxUSE_FONTMAP
3169 #endif // HAVE_ICONV
3171 #ifdef wxHAVE_WIN32_MB2WC
// Win32 converter: constructed from the name if there is one, otherwise
// from the encoding
3174 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3175 : new wxMBConv_win32(m_encoding
);
3184 #endif // wxHAVE_WIN32_MB2WC
3186 #if defined(__WXMAC__)
3188 // leave UTF16 and UTF32 to the built-ins of wx
3189 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3190 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3193 wxMBConv_mac
*conv
= m_name
? new wxMBConv_mac(m_name
)
3194 : new wxMBConv_mac(m_encoding
);
3196 wxMBConv_mac
*conv
= new wxMBConv_mac(m_encoding
);
3206 #if defined(__WXCOCOA__)
3208 if ( m_name
|| ( m_encoding
<= wxFONTENCODING_UTF16
) )
3211 wxMBConv_cocoa
*conv
= m_name
? new wxMBConv_cocoa(m_name
)
3212 : new wxMBConv_cocoa(m_encoding
);
3214 wxMBConv_cocoa
*conv
= new wxMBConv_cocoa(m_encoding
);
// from here on the built-in converters are selected by encoding value
3225 wxFontEncoding enc
= m_encoding
;
3227 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3229 // use "false" to suppress interactive dialogs -- we can be called from
3230 // anywhere and popping up a dialog from here is the last thing we want to do
3232 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3234 #endif // wxUSE_FONTMAP
// hard coded converters for the UTF encodings
3238 case wxFONTENCODING_UTF7
:
3239 return new wxMBConvUTF7
;
3241 case wxFONTENCODING_UTF8
:
3242 return new wxMBConvUTF8
;
3244 case wxFONTENCODING_UTF16BE
:
3245 return new wxMBConvUTF16BE
;
3247 case wxFONTENCODING_UTF16LE
:
3248 return new wxMBConvUTF16LE
;
3250 case wxFONTENCODING_UTF32BE
:
3251 return new wxMBConvUTF32BE
;
3253 case wxFONTENCODING_UTF32LE
:
3254 return new wxMBConvUTF32LE
;
3257 // nothing to do but put here to suppress gcc warnings
// last resort: the wxEncodingConverter based class
3264 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3265 : new wxMBConv_wxwin(m_encoding
);
3271 #endif // wxUSE_FONTMAP
3273 // NB: This is a hack to prevent deadlock. What could otherwise happen
3274 // in Unicode build: wxConvLocal creation ends up being here
3275 // because of some failure and logs the error. But wxLog will try to
3276 // attach timestamp, for which it will need wxConvLocal (to convert
3277 // time to char* and then wchar_t*), but that fails, tries to log
3278 // error, but wxLog has a (already locked) critical section that
3279 // guards static buffer.
//
// The flag is set while the error is being logged and cleared afterwards,
// so that a nested call arriving here does not log again.
3280 static bool alreadyLoggingError
= false;
3281 if (!alreadyLoggingError
)
3283 alreadyLoggingError
= true;
3284 wxLogError(_("Cannot convert from the charset '%s'!"),
3288 wxFontMapperBase::GetEncodingDescription(m_encoding
).c_str()
3289 #else // !wxUSE_FONTMAP
// NOTE(review): m_encoding is an enum value but is formatted with %s here
// -- looks like it should be an integer conversion such as %d; confirm.
3290 wxString::Format(_("encoding %s"), m_encoding
).c_str()
3291 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3294 alreadyLoggingError
= false;
// Creates the real converter: stores the result of DoCreate() in m_convReal
// and resets m_deferred to false. Being a const method, it modifies the
// object through a pointer with constness cast away.
// NOTE(review): presumably all of this only runs while m_deferred is still
// set -- the guard is not shown here; confirm.
3300 void wxCSConv::CreateConvIfNeeded() const
3304 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3307 // if we have neither the name nor the encoding, use the default
3308 // encoding for this system
3309 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
// m_name receives a wxStrdup() copy of the system encoding name
3311 self
->m_name
= wxStrdup(wxLocale::GetSystemEncodingName());
3313 #endif // wxUSE_INTL
3315 self
->m_convReal
= DoCreate();
3316 self
->m_deferred
= false;
// Converts the multibyte string psz to wide characters in buf.
//
// After making sure the real converter has been created, the call is
// forwarded to m_convReal->MB2WC(). The code after that is a direct
// byte-to-wchar_t copy: every byte of psz, including the terminating NUL
// (the loop runs up to and including index len), is widened through an
// unsigned char cast so that bytes >= 0x80 do not become negative values.
// NOTE(review): the visible copy loop does not consult n, so buf presumably
// has to hold at least strlen(psz) + 1 elements; the conditions guarding the
// forwarding and the copy are not shown here -- confirm.
3320 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3322 CreateConvIfNeeded();
3325 return m_convReal
->MB2WC(buf
, psz
, n
);
3328 size_t len
= strlen(psz
);
3332 for (size_t c
= 0; c
<= len
; c
++)
3333 buf
[c
] = (unsigned char)(psz
[c
]);
// Converts the wide string psz to multibyte characters in buf.
//
// After making sure the real converter has been created, the call is
// forwarded to m_convReal->WC2MB(). The code after that handles the direct
// case with two loops over psz, both including the terminating NUL: the
// first narrows each character to char and stores it in buf, and either loop
// can return wxCONV_FAILED.
// NOTE(review): the failure condition is not shown here -- presumably a
// character that does not fit into a single byte; the second loop looks like
// the variant used when no output buffer is given. Confirm both.
3339 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3341 CreateConvIfNeeded();
3344 return m_convReal
->WC2MB(buf
, psz
, n
);
3347 const size_t len
= wxWcslen(psz
);
3350 for (size_t c
= 0; c
<= len
; c
++)
3353 return wxCONV_FAILED
;
3355 buf
[c
] = (char)psz
[c
];
3360 for (size_t c
= 0; c
<= len
; c
++)
3363 return wxCONV_FAILED
;
// Makes sure the real converter has been created and forwards the query to
// m_convReal->GetMBNulLen().
// NOTE(review): what is returned when there is no real converter is not
// shown here; confirm.
3370 size_t wxCSConv::GetMBNulLen() const
3372 CreateConvIfNeeded();
3376 return m_convReal
->GetMBNulLen();
3382 // ----------------------------------------------------------------------------
3384 // ----------------------------------------------------------------------------
// The type of the libc converter object depends on the platform:
// wxMBConv_win32, wxMBConv_mac or wxMBConvLibc.
3387 static wxMBConv_win32 wxConvLibcObj
;
3388 #elif defined(__WXMAC__) && !defined(__MACH__)
3389 static wxMBConv_mac wxConvLibcObj
;
3391 static wxMBConvLibc wxConvLibcObj
;
// file-static converter objects behind the exported names below
3394 static wxCSConv
wxConvLocalObj(wxFONTENCODING_SYSTEM
);
3395 static wxCSConv
wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1
);
3396 static wxMBConvUTF7 wxConvUTF7Obj
;
3397 static wxMBConvUTF8 wxConvUTF8Obj
;
// exported references bound to the static objects above
3399 WXDLLIMPEXP_DATA_BASE(wxMBConv
&) wxConvLibc
= wxConvLibcObj
;
3400 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvLocal
= wxConvLocalObj
;
3401 WXDLLIMPEXP_DATA_BASE(wxCSConv
&) wxConvISO8859_1
= wxConvISO8859_1Obj
;
3402 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7
&) wxConvUTF7
= wxConvUTF7Obj
;
3403 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8
&) wxConvUTF8
= wxConvUTF8Obj
;
// exported pointers: wxConvCurrent starts out pointing at the libc
// converter and wxConvUI at wxConvLocal
3404 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= &wxConvLibcObj
;
3405 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= &wxConvLocal
;
3406 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
= &
3413 #else // !wxUSE_WCHAR_T
3415 // stand-ins in absence of wchar_t
3416 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3421 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T