1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
53 #include "wx/thread.h"
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
64 #define TRACE_STRCONV _T("strconv")
66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
68 #if SIZEOF_WCHAR_T == 2
73 // ============================================================================
75 // ============================================================================
77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
78 static bool NotAllNULs(const char *p
, size_t n
)
80 while ( n
&& *p
++ == '\0' )
86 // ----------------------------------------------------------------------------
87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
88 // ----------------------------------------------------------------------------
90 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
95 *output
= (wxUint16
) input
;
99 else if (input
>= 0x110000)
101 return wxCONV_FAILED
;
107 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
108 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
115 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
117 if ((*input
< 0xd800) || (*input
> 0xdfff))
122 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
125 return wxCONV_FAILED
;
129 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
135 typedef wchar_t wxDecodeSurrogate_t
;
137 typedef wxUint16 wxDecodeSurrogate_t
;
138 #endif // WC_UTF16/!WC_UTF16
140 // returns the next UTF-32 character from the wchar_t buffer and advances the
141 // pointer to the character after this one
143 // if an invalid character is found, *pSrc is set to NULL, the caller must
145 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
149 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
150 if ( n
== wxCONV_FAILED
)
158 // ----------------------------------------------------------------------------
160 // ----------------------------------------------------------------------------
163 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
164 const char *src
, size_t srcLen
) const
166 // although new conversion classes are supposed to implement this function
167 // directly, the existing ones only implement the old MB2WC() and so, to
168 // avoid having to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten
= 0;
176 // the number of NULs terminating this string
177 size_t nulLen
= 0; // not really needed, but just to avoid warnings
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
185 if ( srcLen
!= wxNO_LEN
)
187 // we need to know how to find the end of this string
188 nulLen
= GetMBNulLen();
189 if ( nulLen
== wxCONV_FAILED
)
190 return wxCONV_FAILED
;
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
197 char * const p
= bufTmp
.data();
198 memcpy(p
, src
, srcLen
);
199 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
205 srcEnd
= src
+ srcLen
;
207 else // quit after the first loop iteration
214 // try to convert the current chunk
215 size_t lenChunk
= MB2WC(NULL
, src
, 0);
216 if ( lenChunk
== wxCONV_FAILED
)
217 return wxCONV_FAILED
;
219 lenChunk
++; // for the L'\0' at the end of this chunk
221 dstWritten
+= lenChunk
;
225 // nothing left in the input string, conversion succeeded
231 if ( dstWritten
> dstLen
)
232 return wxCONV_FAILED
;
234 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
235 return wxCONV_FAILED
;
242 // we convert just one chunk in this case as this is the entire
247 // advance the input pointer past the end of this chunk
248 while ( NotAllNULs(src
, nulLen
) )
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
257 src
+= nulLen
; // skipping over its terminator as well
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by srcEnd
270 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
271 const wchar_t *src
, size_t srcLen
) const
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten
= 0;
276 // make a copy of the input string unless it is already properly
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp
;
282 if ( srcLen
== wxNO_LEN
)
284 srcLen
= wxWcslen(src
) + 1;
286 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
288 // make a copy in order to properly NUL-terminate the string
289 bufTmp
= wxWCharBuffer(srcLen
);
290 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
294 const size_t lenNul
= GetMBNulLen();
295 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
297 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
299 // try to convert the current chunk
300 size_t lenChunk
= WC2MB(NULL
, src
, 0);
302 if ( lenChunk
== wxCONV_FAILED
)
303 return wxCONV_FAILED
;
306 dstWritten
+= lenChunk
;
310 if ( dstWritten
> dstLen
)
311 return wxCONV_FAILED
;
313 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
314 return wxCONV_FAILED
;
323 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
325 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
326 if ( rc
!= wxCONV_FAILED
)
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
336 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
338 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
339 if ( rc
!= wxCONV_FAILED
)
347 wxMBConv::~wxMBConv()
349 // nothing to do here (necessary for Darwin linking probably)
352 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
356 // calculate the length of the buffer needed first
357 const size_t nLen
= ToWChar(NULL
, 0, psz
);
358 if ( nLen
!= wxCONV_FAILED
)
360 // now do the actual conversion
361 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
363 // +1 for the trailing NUL
364 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
369 return wxWCharBuffer();
372 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
376 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
377 if ( nLen
!= wxCONV_FAILED
)
379 wxCharBuffer
buf(nLen
- 1);
380 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
385 return wxCharBuffer();
389 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
391 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
392 if ( dstLen
!= wxCONV_FAILED
)
394 // notice that we allocate space for dstLen+1 wide characters here
395 // because we want the buffer to always be NUL-terminated, even if the
396 // input isn't (as otherwise the caller has no way to know its length)
397 wxWCharBuffer
wbuf(dstLen
);
398 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
403 if ( wbuf
[dstLen
- 1] == L
'\0' )
414 return wxWCharBuffer();
418 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
420 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
421 if ( dstLen
!= wxCONV_FAILED
)
423 const size_t nulLen
= GetMBNulLen();
425 // as above, ensure that the buffer is always NUL-terminated, even if
427 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
428 memset(buf
.data() + dstLen
, 0, nulLen
);
429 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
435 if ( dstLen
>= nulLen
&&
436 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
438 // in this case the output is NUL-terminated and we're not
439 // supposed to count NUL
451 return wxCharBuffer();
454 // ----------------------------------------------------------------------------
456 // ----------------------------------------------------------------------------
458 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
460 return wxMB2WC(buf
, psz
, n
);
463 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
465 return wxWC2MB(buf
, psz
, n
);
468 // ----------------------------------------------------------------------------
469 // wxConvBrokenFileNames
470 // ----------------------------------------------------------------------------
474 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
476 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
477 wxStricmp(charset
, _T("UTF8")) == 0 )
478 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
480 m_conv
= new wxCSConv(charset
);
485 // ----------------------------------------------------------------------------
487 // ----------------------------------------------------------------------------
489 // Implementation (C) 2004 Fredrik Roubert
492 // BASE64 decoding table
494 static const unsigned char utf7unb64
[] =
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
500 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
502 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
503 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
504 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
505 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
506 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
507 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
508 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
509 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
510 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
511 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
527 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
530 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
534 while ( *psz
&& (!buf
|| (len
< n
)) )
536 unsigned char cc
= *psz
++;
544 else if (*psz
== '-')
552 else // start of BASE64 encoded string
556 for ( ok
= lsb
= false, d
= 0, l
= 0;
557 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
562 for (l
+= 6; l
>= 8; lsb
= !lsb
)
564 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
574 *buf
= (wchar_t)(c
<< 8);
583 // in valid UTF7 we should have valid characters after '+'
584 return wxCONV_FAILED
;
592 if ( buf
&& (len
< n
) )
599 // BASE64 encoding table
601 static const unsigned char utf7enb64
[] =
603 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
604 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
605 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
606 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
607 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
608 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
609 'w', 'x', 'y', 'z', '0', '1', '2', '3',
610 '4', '5', '6', '7', '8', '9', '+', '/'
614 // UTF-7 encoding table
616 // 0 - Set D (directly encoded characters)
617 // 1 - Set O (optional direct characters)
618 // 2 - whitespace characters (optional)
619 // 3 - special characters
621 static const unsigned char utf7encode
[128] =
623 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
624 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
625 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
626 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
627 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
628 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
629 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
630 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
633 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
637 while (*psz
&& ((!buf
) || (len
< n
)))
640 if (cc
< 0x80 && utf7encode
[cc
] < 1)
649 else if (((wxUint32
)cc
) > 0xffff)
651 // no surrogate pair generation (yet?)
652 return wxCONV_FAILED
;
663 // BASE64 encode string
664 unsigned int lsb
, d
, l
;
665 for (d
= 0, l
= 0; /*nothing*/; psz
++)
667 for (lsb
= 0; lsb
< 2; lsb
++)
670 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
672 for (l
+= 8; l
>= 6; )
676 *buf
++ = utf7enb64
[(d
>> l
) % 64];
682 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
689 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
701 if (buf
&& (len
< n
))
707 // ----------------------------------------------------------------------------
709 // ----------------------------------------------------------------------------
711 static const wxUint32 utf8_max
[]=
712 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
714 // boundaries of the private use area we use to (temporarily) remap bytes
715 // that are invalid in a UTF-8 encoded string
716 const wxUint32 wxUnicodePUA
= 0x100000;
717 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
719 // this table gives the length of the UTF-8 encoding from its first character:
720 const unsigned char tableUtf8Lengths
[256] = {
721 // single-byte sequences (ASCII):
722 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
723 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
724 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
725 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
726 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
727 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
728 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
729 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
731 // these are invalid:
732 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
733 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
734 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
735 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
738 // two-byte sequences:
739 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
740 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
742 // three-byte sequences:
743 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
745 // four-byte sequences:
746 4, 4, 4, 4, 4, // F0..F4
748 // these are invalid again (5- or 6-byte
749 // sequences and sequences for code points
750 // above U+10FFFF, as restricted by RFC 3629):
751 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
755 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
756 const char *src
, size_t srcLen
) const
758 wchar_t *out
= dstLen
? dst
: NULL
;
761 if ( srcLen
== wxNO_LEN
)
762 srcLen
= strlen(src
) + 1;
764 for ( const char *p
= src
; ; p
++ )
766 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
768 // all done successfully, just add the trailing NUL if we are not
769 // using explicit length
770 if ( srcLen
== wxNO_LEN
)
786 if ( out
&& !dstLen
-- )
790 unsigned char c
= *p
;
794 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
797 if ( srcLen
!= wxNO_LEN
)
804 unsigned len
= tableUtf8Lengths
[c
];
808 if ( srcLen
< len
) // the test works for wxNO_LEN too
811 if ( srcLen
!= wxNO_LEN
)
814 // Char. number range | UTF-8 octet sequence
815 // (hexadecimal) | (binary)
816 // ----------------------+----------------------------------------
817 // 0000 0000 - 0000 007F | 0xxxxxxx
818 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
819 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
820 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
822 // Code point value is stored in bits marked with 'x',
823 // lowest-order bit of the value on the right side in the diagram
824 // above. (from RFC 3629)
826 // mask to extract lead byte's value ('x' bits above), by sequence
828 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
830 // mask and value of lead byte's most significant bits, by length:
831 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
832 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
834 len
--; // it's more convenient to work with 0-based length here
836 // extract the lead byte's value bits:
837 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
840 code
= c
& leadValueMask
[len
];
842 // all remaining bytes, if any, are handled in the same way
843 // regardless of sequence's length:
847 if ( (c
& 0xC0) != 0x80 )
848 return wxCONV_FAILED
;
856 // cast is ok because wchar_t == wxUint16 if WC_UTF16
857 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
866 #endif // WC_UTF16/!WC_UTF16
874 return wxCONV_FAILED
;
878 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
879 const wchar_t *src
, size_t srcLen
) const
881 char *out
= dstLen
? dst
: NULL
;
884 for ( const wchar_t *wp
= src
; ; wp
++ )
886 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
--) )
888 // all done successfully, just add the trailing NUL if we are not
889 // using explicit length
890 if ( srcLen
== wxNO_LEN
)
909 // cast is ok for WC_UTF16
910 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
912 // skip the next char too as we decoded a surrogate
915 #else // wchar_t is UTF-32
916 code
= *wp
& 0x7fffffff;
931 else if ( code
<= 0x07FF )
939 // NB: this line takes 6 least significant bits, encodes them as
940 // 10xxxxxx and discards them so that the next byte can be encoded:
941 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
942 out
[0] = 0xC0 | code
;
945 else if ( code
< 0xFFFF )
953 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
954 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
955 out
[0] = 0xE0 | code
;
958 else if ( code
<= 0x10FFFF )
966 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
967 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
968 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
969 out
[0] = 0xF0 | code
;
974 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
987 // we only get here if an error occurs during decoding
988 return wxCONV_FAILED
;
991 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
992 const char *psz
, size_t srcLen
) const
994 if ( m_options
== MAP_INVALID_UTF8_NOT
)
995 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
999 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1001 const char *opsz
= psz
;
1002 bool invalid
= false;
1003 unsigned char cc
= *psz
++, fc
= cc
;
1005 for (cnt
= 0; fc
& 0x80; cnt
++)
1015 // escape the escape character for octal escapes
1016 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1017 && cc
== '\\' && (!buf
|| len
< n
))
1029 // invalid UTF-8 sequence
1034 unsigned ocnt
= cnt
- 1;
1035 wxUint32 res
= cc
& (0x3f >> cnt
);
1039 if ((cc
& 0xC0) != 0x80)
1041 // invalid UTF-8 sequence
1047 res
= (res
<< 6) | (cc
& 0x3f);
1050 if (invalid
|| res
<= utf8_max
[ocnt
])
1052 // illegal UTF-8 encoding
1055 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1056 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1058 // if one of our PUA characters turns up externally
1059 // it must also be treated as an illegal sequence
1060 // (a bit like you have to escape an escape character)
1066 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1067 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1068 if (pa
== wxCONV_FAILED
)
1080 *buf
++ = (wchar_t)res
;
1082 #endif // WC_UTF16/!WC_UTF16
1088 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1090 while (opsz
< psz
&& (!buf
|| len
< n
))
1093 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1094 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1095 wxASSERT(pa
!= wxCONV_FAILED
);
1102 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1108 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1110 while (opsz
< psz
&& (!buf
|| len
< n
))
1112 if ( buf
&& len
+ 3 < n
)
1114 unsigned char on
= *opsz
;
1116 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1117 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1118 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1125 else // MAP_INVALID_UTF8_NOT
1127 return wxCONV_FAILED
;
1133 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1139 static inline bool isoctal(wchar_t wch
)
1141 return L
'0' <= wch
&& wch
<= L
'7';
1144 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1145 const wchar_t *psz
, size_t srcLen
) const
1147 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1148 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1152 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1157 // cast is ok for WC_UTF16
1158 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1159 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1161 cc
= (*psz
++) & 0x7fffffff;
1164 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1165 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1168 *buf
++ = (char)(cc
- wxUnicodePUA
);
1171 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1172 && cc
== L
'\\' && psz
[0] == L
'\\' )
1179 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1181 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1185 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1186 (psz
[1] - L
'0') * 010 +
1196 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1212 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1214 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1220 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1226 // ============================================================================
1228 // ============================================================================
1230 #ifdef WORDS_BIGENDIAN
1231 #define wxMBConvUTF16straight wxMBConvUTF16BE
1232 #define wxMBConvUTF16swap wxMBConvUTF16LE
1234 #define wxMBConvUTF16swap wxMBConvUTF16BE
1235 #define wxMBConvUTF16straight wxMBConvUTF16LE
1239 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1241 if ( srcLen
== wxNO_LEN
)
1243 // count the number of bytes in input, including the trailing NULs
1244 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1245 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1248 srcLen
*= BYTES_PER_CHAR
;
1250 else // we already have the length
1252 // we can only convert an entire number of UTF-16 characters
1253 if ( srcLen
% BYTES_PER_CHAR
)
1254 return wxCONV_FAILED
;
1260 // case when in-memory representation is UTF-16 too
1263 // ----------------------------------------------------------------------------
1264 // conversions without endianness change
1265 // ----------------------------------------------------------------------------
1268 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1269 const char *src
, size_t srcLen
) const
1271 // set up the scene for using memcpy() (which is presumably more efficient
1272 // than copying the bytes one by one)
1273 srcLen
= GetLength(src
, srcLen
);
1274 if ( srcLen
== wxNO_LEN
)
1275 return wxCONV_FAILED
;
1277 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1280 if ( dstLen
< inLen
)
1281 return wxCONV_FAILED
;
1283 memcpy(dst
, src
, srcLen
);
1290 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1291 const wchar_t *src
, size_t srcLen
) const
1293 if ( srcLen
== wxNO_LEN
)
1294 srcLen
= wxWcslen(src
) + 1;
1296 srcLen
*= BYTES_PER_CHAR
;
1300 if ( dstLen
< srcLen
)
1301 return wxCONV_FAILED
;
1303 memcpy(dst
, src
, srcLen
);
1309 // ----------------------------------------------------------------------------
1310 // endian-reversing conversions
1311 // ----------------------------------------------------------------------------
1314 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1315 const char *src
, size_t srcLen
) const
1317 srcLen
= GetLength(src
, srcLen
);
1318 if ( srcLen
== wxNO_LEN
)
1319 return wxCONV_FAILED
;
1321 srcLen
/= BYTES_PER_CHAR
;
1325 if ( dstLen
< srcLen
)
1326 return wxCONV_FAILED
;
1328 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1329 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1331 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1339 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1340 const wchar_t *src
, size_t srcLen
) const
1342 if ( srcLen
== wxNO_LEN
)
1343 srcLen
= wxWcslen(src
) + 1;
1345 srcLen
*= BYTES_PER_CHAR
;
1349 if ( dstLen
< srcLen
)
1350 return wxCONV_FAILED
;
1352 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1353 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1355 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1362 #else // !WC_UTF16: wchar_t is UTF-32
1364 // ----------------------------------------------------------------------------
1365 // conversions without endianness change
1366 // ----------------------------------------------------------------------------
1369 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1370 const char *src
, size_t srcLen
) const
1372 srcLen
= GetLength(src
, srcLen
);
1373 if ( srcLen
== wxNO_LEN
)
1374 return wxCONV_FAILED
;
1376 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1379 // optimization: return maximal space which could be needed for this
1380 // string even if the real size could be smaller if the buffer contains
1386 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1387 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1389 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1391 return wxCONV_FAILED
;
1393 if ( ++outLen
> dstLen
)
1394 return wxCONV_FAILED
;
1404 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1405 const wchar_t *src
, size_t srcLen
) const
1407 if ( srcLen
== wxNO_LEN
)
1408 srcLen
= wxWcslen(src
) + 1;
1411 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1412 for ( size_t n
= 0; n
< srcLen
; n
++ )
1415 const size_t numChars
= encode_utf16(*src
++, cc
);
1416 if ( numChars
== wxCONV_FAILED
)
1417 return wxCONV_FAILED
;
1419 outLen
+= numChars
* BYTES_PER_CHAR
;
1422 if ( outLen
> dstLen
)
1423 return wxCONV_FAILED
;
1426 if ( numChars
== 2 )
1428 // second character of a surrogate
1437 // ----------------------------------------------------------------------------
1438 // endian-reversing conversions
1439 // ----------------------------------------------------------------------------
1442 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1443 const char *src
, size_t srcLen
) const
1445 srcLen
= GetLength(src
, srcLen
);
1446 if ( srcLen
== wxNO_LEN
)
1447 return wxCONV_FAILED
;
1449 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1452 // optimization: return maximal space which could be needed for this
1453 // string even if the real size could be smaller if the buffer contains
1459 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1460 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1465 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1467 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1469 const size_t numChars
= decode_utf16(tmp
, ch
);
1470 if ( numChars
== wxCONV_FAILED
)
1471 return wxCONV_FAILED
;
1473 if ( numChars
== 2 )
1476 if ( ++outLen
> dstLen
)
1477 return wxCONV_FAILED
;
1487 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1488 const wchar_t *src
, size_t srcLen
) const
1490 if ( srcLen
== wxNO_LEN
)
1491 srcLen
= wxWcslen(src
) + 1;
1494 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1495 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1498 const size_t numChars
= encode_utf16(*src
, cc
);
1499 if ( numChars
== wxCONV_FAILED
)
1500 return wxCONV_FAILED
;
1502 outLen
+= numChars
* BYTES_PER_CHAR
;
1505 if ( outLen
> dstLen
)
1506 return wxCONV_FAILED
;
1508 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1509 if ( numChars
== 2 )
1511 // second character of a surrogate
1512 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1520 #endif // WC_UTF16/!WC_UTF16
1523 // ============================================================================
1525 // ============================================================================
1527 #ifdef WORDS_BIGENDIAN
1528 #define wxMBConvUTF32straight wxMBConvUTF32BE
1529 #define wxMBConvUTF32swap wxMBConvUTF32LE
1531 #define wxMBConvUTF32swap wxMBConvUTF32BE
1532 #define wxMBConvUTF32straight wxMBConvUTF32LE
1536 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1537 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1540 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1542 if ( srcLen
== wxNO_LEN
)
1544 // count the number of bytes in input, including the trailing NULs
1545 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1546 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1549 srcLen
*= BYTES_PER_CHAR
;
1551 else // we already have the length
1553 // we can only convert an entire number of UTF-32 characters
1554 if ( srcLen
% BYTES_PER_CHAR
)
1555 return wxCONV_FAILED
;
1561 // case when in-memory representation is UTF-16
1564 // ----------------------------------------------------------------------------
1565 // conversions without endianness change
1566 // ----------------------------------------------------------------------------
1569 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1570 const char *src
, size_t srcLen
) const
1572 srcLen
= GetLength(src
, srcLen
);
1573 if ( srcLen
== wxNO_LEN
)
1574 return wxCONV_FAILED
;
1576 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1577 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1579 for ( size_t n
= 0; n
< inLen
; n
++ )
1582 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1583 if ( numChars
== wxCONV_FAILED
)
1584 return wxCONV_FAILED
;
1589 if ( outLen
> dstLen
)
1590 return wxCONV_FAILED
;
1593 if ( numChars
== 2 )
1595 // second character of a surrogate
1605 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1606 const wchar_t *src
, size_t srcLen
) const
1608 if ( srcLen
== wxNO_LEN
)
1609 srcLen
= wxWcslen(src
) + 1;
1613 // optimization: return maximal space which could be needed for this
1614 // string instead of the exact amount which could be less if there are
1615 // any surrogates in the input
1617 // we consider that surrogates are rare enough to make it worthwhile to
1618 // avoid running the loop below at the cost of slightly extra memory
1620 return srcLen
* BYTES_PER_CHAR
;
1623 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1625 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1627 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1629 return wxCONV_FAILED
;
1631 outLen
+= BYTES_PER_CHAR
;
1633 if ( outLen
> dstLen
)
1634 return wxCONV_FAILED
;
1642 // ----------------------------------------------------------------------------
1643 // endian-reversing conversions
1644 // ----------------------------------------------------------------------------
1647 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1648 const char *src
, size_t srcLen
) const
1650 srcLen
= GetLength(src
, srcLen
);
1651 if ( srcLen
== wxNO_LEN
)
1652 return wxCONV_FAILED
;
1654 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1655 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1657 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1660 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1661 if ( numChars
== wxCONV_FAILED
)
1662 return wxCONV_FAILED
;
1667 if ( outLen
> dstLen
)
1668 return wxCONV_FAILED
;
1671 if ( numChars
== 2 )
1673 // second character of a surrogate
1683 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1684 const wchar_t *src
, size_t srcLen
) const
1686 if ( srcLen
== wxNO_LEN
)
1687 srcLen
= wxWcslen(src
) + 1;
1691 // optimization: return maximal space which could be needed for this
1692 // string instead of the exact amount which could be less if there are
1693 // any surrogates in the input
1695 // we consider that surrogates are rare enough to make it worthwhile to
1696 // avoid running the loop below at the cost of slightly extra memory
1698 return srcLen
*BYTES_PER_CHAR
;
1701 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1703 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1705 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1707 return wxCONV_FAILED
;
1709 outLen
+= BYTES_PER_CHAR
;
1711 if ( outLen
> dstLen
)
1712 return wxCONV_FAILED
;
1714 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1720 #else // !WC_UTF16: wchar_t is UTF-32
1722 // ----------------------------------------------------------------------------
1723 // conversions without endianness change
1724 // ----------------------------------------------------------------------------
1727 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1728 const char *src
, size_t srcLen
) const
1730 // use memcpy() as it should be much faster than hand-written loop
1731 srcLen
= GetLength(src
, srcLen
);
1732 if ( srcLen
== wxNO_LEN
)
1733 return wxCONV_FAILED
;
1735 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1738 if ( dstLen
< inLen
)
1739 return wxCONV_FAILED
;
1741 memcpy(dst
, src
, srcLen
);
1748 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1749 const wchar_t *src
, size_t srcLen
) const
1751 if ( srcLen
== wxNO_LEN
)
1752 srcLen
= wxWcslen(src
) + 1;
1754 srcLen
*= BYTES_PER_CHAR
;
1758 if ( dstLen
< srcLen
)
1759 return wxCONV_FAILED
;
1761 memcpy(dst
, src
, srcLen
);
1767 // ----------------------------------------------------------------------------
1768 // endian-reversing conversions
1769 // ----------------------------------------------------------------------------
1772 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1773 const char *src
, size_t srcLen
) const
1775 srcLen
= GetLength(src
, srcLen
);
1776 if ( srcLen
== wxNO_LEN
)
1777 return wxCONV_FAILED
;
1779 srcLen
/= BYTES_PER_CHAR
;
1783 if ( dstLen
< srcLen
)
1784 return wxCONV_FAILED
;
1786 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1787 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1789 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1797 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1798 const wchar_t *src
, size_t srcLen
) const
1800 if ( srcLen
== wxNO_LEN
)
1801 srcLen
= wxWcslen(src
) + 1;
1803 srcLen
*= BYTES_PER_CHAR
;
1807 if ( dstLen
< srcLen
)
1808 return wxCONV_FAILED
;
1810 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1811 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1813 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1820 #endif // WC_UTF16/!WC_UTF16
1823 // ============================================================================
1824 // The classes doing conversion using the iconv_xxx() functions
1825 // ============================================================================
1829 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1830 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1831 // (unless there's yet another bug in glibc) the only case when iconv()
1832 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1833 // left in the input buffer -- when _real_ error occurs,
1834 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1836 // [This bug does not appear in glibc 2.2.]
1837 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
// glibc <= 2.1 workaround: a (size_t)-1 result only counts as a real failure
// if errno is not E2BIG or if there are still bytes left in the input buffer.
// The arguments are parenthesized so that arbitrary expressions can be passed.
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
// iconv() reports failure by returning (size_t)-1; bufLeft is not needed by
// this variant. The argument is parenthesized so that arbitrary expressions
// can be passed.
#define ICONV_FAILED(cres, bufLeft) ((cres) == (size_t)-1)
1844 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1846 #define ICONV_T_INVALID ((iconv_t)-1)
1848 #if SIZEOF_WCHAR_T == 4
1849 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1850 #define WC_ENC wxFONTENCODING_UTF32
1851 #elif SIZEOF_WCHAR_T == 2
1852 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1853 #define WC_ENC wxFONTENCODING_UTF16
1854 #else // sizeof(wchar_t) != 2 nor 4
1855 // does this ever happen?
1856 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1859 // ----------------------------------------------------------------------------
1860 // wxMBConv_iconv: encapsulates an iconv character set
1861 // ----------------------------------------------------------------------------
1863 class wxMBConv_iconv
: public wxMBConv
1866 wxMBConv_iconv(const char *name
);
1867 virtual ~wxMBConv_iconv();
1869 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1870 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1872 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1873 virtual size_t GetMBNulLen() const;
1875 #if wxUSE_UNICODE_UTF8
1876 virtual bool IsUTF8() const;
1879 virtual wxMBConv
*Clone() const
1881 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
1882 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1887 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1890 // the iconv handlers used to translate from multibyte
1891 // to wide char and in the other direction
1896 // guards access to m2w and w2m objects
1897 wxMutex m_iconvMutex
;
1901 // the name (for iconv_open()) of a wide char charset -- if none is
1902 // available on this machine, it will remain NULL
1903 static wxString ms_wcCharsetName
;
1905 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1906 // different endian-ness than the native one
1907 static bool ms_wcNeedsSwap
;
1910 // name of the encoding handled by this conversion
1913 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1915 size_t m_minMBCharWidth
;
1918 // make the constructor available for unit testing
1919 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
1921 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1922 if ( !result
->IsOk() )
1931 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1932 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1934 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
1937 m_minMBCharWidth
= 0;
1939 // check for charset that represents wchar_t:
1940 if ( ms_wcCharsetName
.empty() )
1942 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1945 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1946 #else // !wxUSE_FONTMAP
1947 static const wxChar
*names_static
[] =
1949 #if SIZEOF_WCHAR_T == 4
1951 #elif SIZEOF_WCHAR_T = 2
1956 const wxChar
**names
= names_static
;
1957 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1959 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1961 const wxString
nameCS(*names
);
1963 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1964 wxString
nameXE(nameCS
);
1966 #ifdef WORDS_BIGENDIAN
1968 #else // little endian
1972 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1975 m2w
= iconv_open(nameXE
.ToAscii(), name
);
1976 if ( m2w
== ICONV_T_INVALID
)
1978 // try charset w/o bytesex info (e.g. "UCS4")
1979 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1981 m2w
= iconv_open(nameCS
.ToAscii(), name
);
1983 // and check for bytesex ourselves:
1984 if ( m2w
!= ICONV_T_INVALID
)
1986 char buf
[2], *bufPtr
;
1987 wchar_t wbuf
[2], *wbufPtr
;
1995 outsz
= SIZEOF_WCHAR_T
* 2;
2000 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2001 (char**)&wbufPtr
, &outsz
);
2003 if (ICONV_FAILED(res
, insz
))
2005 wxLogLastError(wxT("iconv"));
2006 wxLogError(_("Conversion to charset '%s' doesn't work."),
2009 else // ok, can convert to this encoding, remember it
2011 ms_wcCharsetName
= nameCS
;
2012 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2016 else // use charset not requiring byte swapping
2018 ms_wcCharsetName
= nameXE
;
2022 wxLogTrace(TRACE_STRCONV
,
2023 wxT("iconv wchar_t charset is \"%s\"%s"),
2024 ms_wcCharsetName
.empty() ? wxString("<none>")
2026 ms_wcNeedsSwap
? _T(" (needs swap)")
2029 else // we already have ms_wcCharsetName
2031 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2034 if ( ms_wcCharsetName
.empty() )
2036 w2m
= ICONV_T_INVALID
;
2040 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2041 if ( w2m
== ICONV_T_INVALID
)
2043 wxLogTrace(TRACE_STRCONV
,
2044 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2045 ms_wcCharsetName
.c_str(), name
);
2050 wxMBConv_iconv::~wxMBConv_iconv()
2052 if ( m2w
!= ICONV_T_INVALID
)
2054 if ( w2m
!= ICONV_T_INVALID
)
2058 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
// find the string length: notice that this must be done differently for
// NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2063 const size_t nulLen
= GetMBNulLen();
2067 return wxCONV_FAILED
;
2070 inbuf
= strlen(psz
); // arguably more optimized than our version
// for UTF-16/32 we not only need to have 2/4 consecutive NULs but
// they also have to start at a character boundary and not span two
// adjacent characters
2079 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2086 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2087 // Unfortunately there are a couple of global wxCSConv objects such as
2088 // wxConvLocal that are used all over wx code, so we have to make sure
2089 // the handle is used by at most one thread at the time. Otherwise
2090 // only a few wx classes would be safe to use from non-main threads
2091 // as MB<->WC conversion would fail "randomly".
2092 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2093 #endif // wxUSE_THREADS
2095 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
2097 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2098 wchar_t *bufPtr
= buf
;
2099 const char *pszPtr
= psz
;
2103 // have destination buffer, convert there
2105 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
2106 (char**)&bufPtr
, &outbuf
);
2107 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
2111 // convert to native endianness
2112 for ( unsigned i
= 0; i
< res
; i
++ )
2113 buf
[n
] = WC_BSWAP(buf
[i
]);
2116 // NUL-terminate the string if there is any space left
2122 // no destination buffer... convert using temp buffer
2123 // to calculate destination buffer requirement
2130 outbuf
= 8 * SIZEOF_WCHAR_T
;
2133 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
2134 (char**)&bufPtr
, &outbuf
);
2136 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
2138 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2141 if (ICONV_FAILED(cres
, inbuf
))
2143 //VS: it is ok if iconv fails, hence trace only
2144 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2145 return wxCONV_FAILED
;
2151 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2154 // NB: explained in MB2WC
2155 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2158 size_t inlen
= wxWcslen(psz
);
2159 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
2163 wchar_t *tmpbuf
= 0;
2167 // need to copy to temp buffer to switch endianness
2168 // (doing WC_BSWAP twice on the original buffer won't help, as it
2169 // could be in read-only memory, or be accessed in some other thread)
2170 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
2171 for ( size_t i
= 0; i
< inlen
; i
++ )
2172 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
2174 tmpbuf
[inlen
] = L
'\0';
2180 // have destination buffer, convert there
2181 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
2185 // NB: iconv was given only wcslen(psz) characters on input, and so
2186 // it couldn't convert the trailing zero. Let's do it ourselves
2187 // if there's some room left for it in the output buffer.
2193 // no destination buffer: convert using temp buffer
2194 // to calculate destination buffer requirement
2202 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
2206 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2214 if (ICONV_FAILED(cres
, inbuf
))
2216 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2217 return wxCONV_FAILED
;
2223 size_t wxMBConv_iconv::GetMBNulLen() const
2225 if ( m_minMBCharWidth
== 0 )
2227 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2230 // NB: explained in MB2WC
2231 wxMutexLocker
lock(self
->m_iconvMutex
);
2234 const wchar_t *wnul
= L
"";
2235 char buf
[8]; // should be enough for NUL in any encoding
2236 size_t inLen
= sizeof(wchar_t),
2237 outLen
= WXSIZEOF(buf
);
2238 char *inBuff
= (char *)wnul
;
2239 char *outBuff
= buf
;
2240 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2242 self
->m_minMBCharWidth
= (size_t)-1;
2246 self
->m_minMBCharWidth
= outBuff
- buf
;
2250 return m_minMBCharWidth
;
2253 #if wxUSE_UNICODE_UTF8
2254 bool wxMBConv_iconv::IsUTF8() const
2256 return wxStricmp(m_name
, "UTF-8") == 0 ||
2257 wxStricmp(m_name
, "UTF8") == 0;
2261 #endif // HAVE_ICONV
2264 // ============================================================================
2265 // Win32 conversion classes
2266 // ============================================================================
2268 #ifdef wxHAVE_WIN32_MB2WC
2272 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2273 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2276 class wxMBConv_win32
: public wxMBConv
2281 m_CodePage
= CP_ACP
;
2282 m_minMBCharWidth
= 0;
2285 wxMBConv_win32(const wxMBConv_win32
& conv
)
2288 m_CodePage
= conv
.m_CodePage
;
2289 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2293 wxMBConv_win32(const char* name
)
2295 m_CodePage
= wxCharsetToCodepage(name
);
2296 m_minMBCharWidth
= 0;
2299 wxMBConv_win32(wxFontEncoding encoding
)
2301 m_CodePage
= wxEncodingToCodepage(encoding
);
2302 m_minMBCharWidth
= 0;
2304 #endif // wxUSE_FONTMAP
2306 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
// note that we have to use MB_ERR_INVALID_CHARS flag as without it
2309 // the behaviour is not compatible with the Unix version (using iconv)
2310 // and break the library itself, e.g. wxTextInputStream::NextChar()
2311 // wouldn't work if reading an incomplete MB char didn't result in an
2314 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2315 // Win XP or newer and it is not supported for UTF-[78] so we always
2316 // use our own conversions in this case. See
2317 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2318 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2319 if ( m_CodePage
== CP_UTF8
)
2321 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2324 if ( m_CodePage
== CP_UTF7
)
2326 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2330 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2331 IsAtLeastWin2kSP4() )
2333 flags
= MB_ERR_INVALID_CHARS
;
2336 const size_t len
= ::MultiByteToWideChar
2338 m_CodePage
, // code page
2339 flags
, // flags: fall on error
2340 psz
, // input string
2341 -1, // its length (NUL-terminated)
2342 buf
, // output string
2343 buf
? n
: 0 // size of output buffer
2347 // function totally failed
2348 return wxCONV_FAILED
;
2351 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2352 // check if we succeeded, by doing a double trip:
2353 if ( !flags
&& buf
)
2355 const size_t mbLen
= strlen(psz
);
2356 wxCharBuffer
mbBuf(mbLen
);
2357 if ( ::WideCharToMultiByte
2364 mbLen
+ 1, // size in bytes, not length
2368 strcmp(mbBuf
, psz
) != 0 )
2370 // we didn't obtain the same thing we started from, hence
2371 // the conversion was lossy and we consider that it failed
2372 return wxCONV_FAILED
;
2376 // note that it returns count of written chars for buf != NULL and size
2377 // of the needed buffer for buf == NULL so in either case the length of
2378 // the string (which never includes the terminating NUL) is one less
2382 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2385 we have a problem here: by default, WideCharToMultiByte() may
2386 replace characters unrepresentable in the target code page with bad
2387 quality approximations such as turning "1/2" symbol (U+00BD) into
2388 "1" for the code pages which don't have it and we, obviously, want
2389 to avoid this at any price
2391 the trouble is that this function does it _silently_, i.e. it won't
2392 even tell us whether it did or not... Win98/2000 and higher provide
2393 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2394 we have to resort to a round trip, i.e. check that converting back
2395 results in the same string -- this is, of course, expensive but
2396 otherwise we simply can't be sure to not garble the data.
2399 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2400 // it doesn't work with CJK encodings (which we test for rather roughly
2401 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2403 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2406 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2408 // it's our lucky day
2409 flags
= WC_NO_BEST_FIT_CHARS
;
2410 pUsedDef
= &usedDef
;
2412 else // old system or unsupported encoding
2418 const size_t len
= ::WideCharToMultiByte
2420 m_CodePage
, // code page
2421 flags
, // either none or no best fit
2422 pwz
, // input string
2423 -1, // it is (wide) NUL-terminated
2424 buf
, // output buffer
2425 buf
? n
: 0, // and its size
2426 NULL
, // default "replacement" char
2427 pUsedDef
// [out] was it used?
2432 // function totally failed
2433 return wxCONV_FAILED
;
2436 // we did something, check if we really succeeded
2439 // check if the conversion failed, i.e. if any replacements
2442 return wxCONV_FAILED
;
2444 else // we must resort to double tripping...
2446 // first we need to ensure that we really have the MB data: this is
2447 // not the case if we're called with NULL buffer, in which case we
2448 // need to do the conversion yet again
2449 wxCharBuffer bufDef
;
2452 bufDef
= wxCharBuffer(len
);
2453 buf
= bufDef
.data();
2454 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2455 buf
, len
, NULL
, NULL
) )
2456 return wxCONV_FAILED
;
2459 wxWCharBuffer
wcBuf(n
);
2460 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2461 wcscmp(wcBuf
, pwz
) != 0 )
2463 // we didn't obtain the same thing we started from, hence
2464 // the conversion was lossy and we consider that it failed
2465 return wxCONV_FAILED
;
2469 // see the comment above for the reason of "len - 1"
2473 virtual size_t GetMBNulLen() const
2475 if ( m_minMBCharWidth
== 0 )
2477 int len
= ::WideCharToMultiByte
2479 m_CodePage
, // code page
2481 L
"", // input string
2482 1, // translate just the NUL
2483 NULL
, // output buffer
2485 NULL
, // no replacement char
2486 NULL
// [out] don't care if it was used
2489 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2493 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2494 self
->m_minMBCharWidth
= (size_t)-1;
2498 self
->m_minMBCharWidth
= (size_t)-1;
2504 self
->m_minMBCharWidth
= len
;
2509 return m_minMBCharWidth
;
2512 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2514 bool IsOk() const { return m_CodePage
!= -1; }
2517 static bool CanUseNoBestFit()
2519 static int s_isWin98Or2k
= -1;
2521 if ( s_isWin98Or2k
== -1 )
2524 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2526 case wxOS_WINDOWS_9X
:
2527 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2530 case wxOS_WINDOWS_NT
:
2531 s_isWin98Or2k
= verMaj
>= 5;
2535 // unknown: be conservative by default
2540 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2543 return s_isWin98Or2k
== 1;
2546 static bool IsAtLeastWin2kSP4()
2551 static int s_isAtLeastWin2kSP4
= -1;
2553 if ( s_isAtLeastWin2kSP4
== -1 )
2555 OSVERSIONINFOEX ver
;
2557 memset(&ver
, 0, sizeof(ver
));
2558 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2559 GetVersionEx((OSVERSIONINFO
*)&ver
);
2561 s_isAtLeastWin2kSP4
=
2562 ((ver
.dwMajorVersion
> 5) || // Vista+
2563 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2564 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2565 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2569 return s_isAtLeastWin2kSP4
== 1;
2574 // the code page we're working with
2577 // cached result of GetMBNulLen(), set to 0 initially meaning
2579 size_t m_minMBCharWidth
;
2582 #endif // wxHAVE_WIN32_MB2WC
2585 // ============================================================================
2586 // wxEncodingConverter based conversion classes
2587 // ============================================================================
2591 class wxMBConv_wxwin
: public wxMBConv
2596 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2597 // The wxMBConv_cf class does a better job.
2598 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2599 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2600 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2604 // temporarily just use wxEncodingConverter stuff,
2605 // so that it works while a better implementation is built
2606 wxMBConv_wxwin(const char* name
)
2609 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2611 m_enc
= wxFONTENCODING_SYSTEM
;
2616 wxMBConv_wxwin(wxFontEncoding enc
)
2623 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2625 size_t inbuf
= strlen(psz
);
2628 if (!m2w
.Convert(psz
, buf
))
2629 return wxCONV_FAILED
;
2634 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2636 const size_t inbuf
= wxWcslen(psz
);
2639 if (!w2m
.Convert(psz
, buf
))
2640 return wxCONV_FAILED
;
2646 virtual size_t GetMBNulLen() const
2650 case wxFONTENCODING_UTF16BE
:
2651 case wxFONTENCODING_UTF16LE
:
2654 case wxFONTENCODING_UTF32BE
:
2655 case wxFONTENCODING_UTF32LE
:
2663 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2665 bool IsOk() const { return m_ok
; }
2668 wxFontEncoding m_enc
;
2669 wxEncodingConverter m2w
, w2m
;
2672 // were we initialized successfully?
2675 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2678 // make the constructors available for unit testing
2679 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2681 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2682 if ( !result
->IsOk() )
2691 #endif // wxUSE_FONTMAP
2693 // ============================================================================
2694 // wxCSConv implementation
2695 // ============================================================================
2697 void wxCSConv::Init()
2704 wxCSConv::wxCSConv(const wxString
& charset
)
2708 if ( !charset
.empty() )
2710 SetName(charset
.ToAscii());
2714 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2716 m_encoding
= wxFONTENCODING_SYSTEM
;
2720 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2722 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2724 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2726 encoding
= wxFONTENCODING_SYSTEM
;
2731 m_encoding
= encoding
;
2734 wxCSConv::~wxCSConv()
2739 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2744 SetName(conv
.m_name
);
2745 m_encoding
= conv
.m_encoding
;
2748 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2752 SetName(conv
.m_name
);
2753 m_encoding
= conv
.m_encoding
;
2758 void wxCSConv::Clear()
2767 void wxCSConv::SetName(const char *charset
)
2771 m_name
= wxStrdup(charset
);
2778 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2779 wxEncodingNameCache
);
2781 static wxEncodingNameCache gs_nameCache
;
2784 wxMBConv
*wxCSConv::DoCreate() const
2787 wxLogTrace(TRACE_STRCONV
,
2788 wxT("creating conversion for %s"),
2790 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2791 #endif // wxUSE_FONTMAP
2793 // check for the special case of ASCII or ISO8859-1 charset: as we have
2794 // special knowledge of it anyhow, we don't need to create a special
2795 // conversion object
2796 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2797 m_encoding
== wxFONTENCODING_DEFAULT
)
2799 // don't convert at all
2803 // we trust OS to do conversion better than we can so try external
2804 // conversion methods first
2806 // the full order is:
2807 // 1. OS conversion (iconv() under Unix or Win32 API)
2808 // 2. hard coded conversions for UTF
2809 // 3. wxEncodingConverter as fall back
2815 #endif // !wxUSE_FONTMAP
2818 wxFontEncoding
encoding(m_encoding
);
2823 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2831 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2832 #endif // wxUSE_FONTMAP
2836 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2837 if ( it
!= gs_nameCache
.end() )
2839 if ( it
->second
.empty() )
2842 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
2849 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2850 // CS : in case this does not return valid names (eg for MacRoman)
2851 // encoding got a 'failure' entry in the cache all the same,
2852 // although it just has to be created using a different method, so
2853 // only store failed iconv creation attempts (or perhaps we
// shouldn't do this at all?)
2855 if ( names
[0] != NULL
)
2857 for ( ; *names
; ++names
)
2859 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2860 // will need changes that will obsolete this
2861 wxString
name(*names
);
2862 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
2865 gs_nameCache
[encoding
] = *names
;
2872 gs_nameCache
[encoding
] = _T(""); // cache the failure
2875 #endif // wxUSE_FONTMAP
2877 #endif // HAVE_ICONV
2879 #ifdef wxHAVE_WIN32_MB2WC
2882 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2883 : new wxMBConv_win32(m_encoding
);
2892 #endif // wxHAVE_WIN32_MB2WC
2896 // leave UTF16 and UTF32 to the built-ins of wx
2897 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2898 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2901 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
2902 : new wxMBConv_cf(m_encoding
);
2904 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
2913 #endif // __DARWIN__
2916 wxFontEncoding enc
= m_encoding
;
2918 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2920 // use "false" to suppress interactive dialogs -- we can be called from
2921 // anywhere and popping up a dialog from here is the last thing we want to
2923 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2925 #endif // wxUSE_FONTMAP
2929 case wxFONTENCODING_UTF7
:
2930 return new wxMBConvUTF7
;
2932 case wxFONTENCODING_UTF8
:
2933 return new wxMBConvUTF8
;
2935 case wxFONTENCODING_UTF16BE
:
2936 return new wxMBConvUTF16BE
;
2938 case wxFONTENCODING_UTF16LE
:
2939 return new wxMBConvUTF16LE
;
2941 case wxFONTENCODING_UTF32BE
:
2942 return new wxMBConvUTF32BE
;
2944 case wxFONTENCODING_UTF32LE
:
2945 return new wxMBConvUTF32LE
;
2948 // nothing to do but put here to suppress gcc warnings
2955 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2956 : new wxMBConv_wxwin(m_encoding
);
2962 #endif // wxUSE_FONTMAP
2964 // NB: This is a hack to prevent deadlock. What could otherwise happen
2965 // in Unicode build: wxConvLocal creation ends up being here
2966 // because of some failure and logs the error. But wxLog will try to
2967 // attach a timestamp, for which it will need wxConvLocal (to convert
2968 // time to char* and then wchar_t*), but that fails, tries to log the
2969 // error, but wxLog has an (already locked) critical section that
2970 // guards the static buffer.
2971 static bool alreadyLoggingError
= false;
2972 if (!alreadyLoggingError
)
2974 alreadyLoggingError
= true;
2975 wxLogError(_("Cannot convert from the charset '%s'!"),
2979 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
2980 #else // !wxUSE_FONTMAP
2981 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
2982 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2985 alreadyLoggingError
= false;
2991 void wxCSConv::CreateConvIfNeeded() const
2995 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
// if we have neither the name nor the encoding, use the default
// encoding for this system
2999 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3002 self
->m_encoding
= wxLocale::GetSystemEncoding();
3004 // fallback to some reasonable default:
3005 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3006 #endif // wxUSE_INTL
3009 self
->m_convReal
= DoCreate();
3010 self
->m_deferred
= false;
3014 bool wxCSConv::IsOk() const
3016 CreateConvIfNeeded();
3018 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3019 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3020 return true; // always ok as we do it ourselves
3022 // m_convReal->IsOk() is called at its own creation, so we know it must
3023 // be ok if m_convReal is non-NULL
3024 return m_convReal
!= NULL
;
3027 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3028 const char *src
, size_t srcLen
) const
3030 CreateConvIfNeeded();
3033 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3036 return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
);
3039 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3040 const wchar_t *src
, size_t srcLen
) const
3042 CreateConvIfNeeded();
3045 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3048 return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
);
3051 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3053 CreateConvIfNeeded();
3056 return m_convReal
->MB2WC(buf
, psz
, n
);
3059 size_t len
= strlen(psz
);
3063 for (size_t c
= 0; c
<= len
; c
++)
3064 buf
[c
] = (unsigned char)(psz
[c
]);
3070 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3072 CreateConvIfNeeded();
3075 return m_convReal
->WC2MB(buf
, psz
, n
);
3078 const size_t len
= wxWcslen(psz
);
3081 for (size_t c
= 0; c
<= len
; c
++)
3084 return wxCONV_FAILED
;
3086 buf
[c
] = (char)psz
[c
];
3091 for (size_t c
= 0; c
<= len
; c
++)
3094 return wxCONV_FAILED
;
3101 size_t wxCSConv::GetMBNulLen() const
3103 CreateConvIfNeeded();
3107 return m_convReal
->GetMBNulLen();
3110 // otherwise, we are ISO-8859-1
3114 #if wxUSE_UNICODE_UTF8
3115 bool wxCSConv::IsUTF8() const
3117 CreateConvIfNeeded();
3121 return m_convReal
->IsUTF8();
3124 // otherwise, we are ISO-8859-1
// Convert the narrow string s to a wide character buffer, trying several
// converters in turn.
//
// The converters appear in this order: wxConvLibc, then a temporary
// wxMBConvUTF8, then wxConvISO8859_1. There is also an early exit returning
// an empty wxWCharBuffer.
//
// NOTE(review): the conditions are not visible in this excerpt -- presumably
// the empty buffer is returned for a NULL s and each later converter is only
// tried if the previous one produced nothing; confirm against the full
// source.
3132 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3135 return wxWCharBuffer();
3137 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3139 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3141 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Convert the wide string ws to a narrow character buffer, trying several
// converters in turn.
//
// wxConvLibc is tried first, then a temporary wxMBConvUTF8 created with the
// MAP_INVALID_UTF8_TO_OCTAL option. There is also an early exit returning an
// empty wxCharBuffer.
//
// NOTE(review): the conditions are not visible in this excerpt -- presumably
// the empty buffer is returned for a NULL ws and the UTF-8 converter is only
// used if wxConvLibc produced nothing; confirm against the full source.
3146 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3149 return wxCharBuffer();
3151 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3153 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3158 #endif // wxUSE_UNICODE
3160 // ----------------------------------------------------------------------------
3162 // ----------------------------------------------------------------------------
3164 // NB: The reason why we create converter objects in this convoluted way,
3165 // using a factory function instead of global variable, is that they
3166 // may be used at static initialization time (some of them are used by
3167 // wxString ctors and there may be a global wxString object). In other
3168 // words, possibly _before_ the converter global object would be
3175 #undef wxConvISO8859_1
// Define the global converter called 'name'.
//
//  klass       type of the exported pointer and of the accessor's result
//  impl_klass  type of the object actually created
//  name        base name used to build all the identifiers below
//  ctor_args   text pasted after the object name in its declaration
//
// The expansion contains three things:
//  - the exported pointer name##Ptr, initially NULL;
//  - the accessor wxGet_##name##Ptr() returning the address of a
//    function-local static impl_klass object;
//  - the file-static gs_##name##instance, initialized by calling the
//    accessor.
3177 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3178 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3179 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3181 static impl_klass name##Obj ctor_args; \
3182 return &name##Obj; \
3184 /* this ensures that all global converter objects are created */ \
3185 /* by the time static initialization is done, i.e. before any */ \
3186 /* thread is launched: */ \
3187 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for WX_DEFINE_GLOBAL_CONV2 for the common case when the same
// class is used both as the pointer type and as the type of the object
// created.
3189 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3190 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// wxConvLibc is exposed as a plain wxMBConv; the object behind it is either a
// wxMBConv_win32 or a wxMBConvLibc, created without constructor arguments.
//
// NOTE(review): the preprocessor conditional selecting one of the two
// definitions is not visible in this excerpt.
3193 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3195 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3198 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3199 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3200 // provokes an error message about "not enough macro parameters"; and we
3201 // can't use "()" here as the name##Obj declaration would be parsed as a
3202 // function declaration then, so use a semicolon and live with an extra
3203 // empty statement (and hope that no compiler warns about this)
// The UTF-8 and UTF-7 converters take no constructor arguments, hence the
// lone semicolon passed as ctor_args.
3204 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3205 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
// Both of these are wxCSConv objects differing only in the encoding passed to
// the constructor: wxFONTENCODING_SYSTEM for wxConvLocal and
// wxFONTENCODING_ISO8859_1 for wxConvISO8859_1.
3207 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3208 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// Exported converter pointers initialized from the accessor functions rather
// than from the objects directly: wxConvCurrent starts out pointing to the
// wxConvLibc object and wxConvUI to the wxConvLocal one.
3210 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3211 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3214 // The xnu kernel always communicates file paths in decomposed UTF-8.
3215 // WARNING: Are we sure that CFString's conversion will cause decomposition?
// File-local wxMBConv_cf converter constructed with wxFONTENCODING_UTF8,
// followed by the definition of the exported wxConvFileName pointer. In the
// !__DARWIN__ branch wxConvFileName points to the wxConvLibc object.
//
// NOTE(review): the __DARWIN__ branch of the initializer is not visible in
// this excerpt -- presumably it is the address of wxConvMacUTF8DObj; confirm
// against the full source.
3216 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
3219 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3222 #else // !__DARWIN__
3223 wxGet_wxConvLibcPtr();
3224 #endif // __DARWIN__/!__DARWIN__
3226 #else // !wxUSE_WCHAR_T
3228 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3229 // stand-ins in absence of wchar_t
3230 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3235 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T