1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
53 #include "wx/thread.h"
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
64 #define TRACE_STRCONV _T("strconv")
66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
68 #if SIZEOF_WCHAR_T == 2
73 // ============================================================================
75 // ============================================================================
77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
78 static bool NotAllNULs(const char *p
, size_t n
)
80 while ( n
&& *p
++ == '\0' )
86 // ----------------------------------------------------------------------------
87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
88 // ----------------------------------------------------------------------------
90 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
95 *output
= (wxUint16
) input
;
99 else if (input
>= 0x110000)
101 return wxCONV_FAILED
;
107 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
108 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
115 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
117 if ((*input
< 0xd800) || (*input
> 0xdfff))
122 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
125 return wxCONV_FAILED
;
129 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
135 typedef wchar_t wxDecodeSurrogate_t
;
137 typedef wxUint16 wxDecodeSurrogate_t
;
138 #endif // WC_UTF16/!WC_UTF16
140 // returns the next UTF-32 character from the wchar_t buffer and advances the
141 // pointer to the character after this one
143 // if an invalid character is found, *pSrc is set to NULL, the caller must
145 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
149 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
150 if ( n
== wxCONV_FAILED
)
158 // ----------------------------------------------------------------------------
160 // ----------------------------------------------------------------------------
163 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
164 const char *src
, size_t srcLen
) const
166 // although new conversion classes are supposed to implement this function
167 // directly, the existing ones only implement the old MB2WC() and so, to
168 // avoid having to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten
= 0;
176 // the number of NULs terminating this string
177 size_t nulLen
= 0; // not really needed, but just to avoid warnings
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
185 if ( srcLen
!= wxNO_LEN
)
187 // we need to know how to find the end of this string
188 nulLen
= GetMBNulLen();
189 if ( nulLen
== wxCONV_FAILED
)
190 return wxCONV_FAILED
;
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
197 char * const p
= bufTmp
.data();
198 memcpy(p
, src
, srcLen
);
199 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
205 srcEnd
= src
+ srcLen
;
207 else // quit after the first loop iteration
214 // try to convert the current chunk
215 size_t lenChunk
= MB2WC(NULL
, src
, 0);
216 if ( lenChunk
== wxCONV_FAILED
)
217 return wxCONV_FAILED
;
219 lenChunk
++; // for the L'\0' at the end of this chunk
221 dstWritten
+= lenChunk
;
225 // nothing left in the input string, conversion succeeded
231 if ( dstWritten
> dstLen
)
232 return wxCONV_FAILED
;
234 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
235 return wxCONV_FAILED
;
242 // we convert just one chunk in this case as this is the entire
247 // advance the input pointer past the end of this chunk
248 while ( NotAllNULs(src
, nulLen
) )
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
257 src
+= nulLen
; // skipping over its terminator as well
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by srcEnd
270 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
271 const wchar_t *src
, size_t srcLen
) const
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten
= 0;
276 // make a copy of the input string unless it is already properly
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp
;
282 if ( srcLen
== wxNO_LEN
)
284 srcLen
= wxWcslen(src
) + 1;
286 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
288 // make a copy in order to properly NUL-terminate the string
289 bufTmp
= wxWCharBuffer(srcLen
);
290 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
294 const size_t lenNul
= GetMBNulLen();
295 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
297 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
299 // try to convert the current chunk
300 size_t lenChunk
= WC2MB(NULL
, src
, 0);
302 if ( lenChunk
== wxCONV_FAILED
)
303 return wxCONV_FAILED
;
306 dstWritten
+= lenChunk
;
310 if ( dstWritten
> dstLen
)
311 return wxCONV_FAILED
;
313 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
314 return wxCONV_FAILED
;
323 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
325 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
326 if ( rc
!= wxCONV_FAILED
)
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
336 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
338 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
339 if ( rc
!= wxCONV_FAILED
)
347 wxMBConv::~wxMBConv()
349 // nothing to do here (necessary for Darwin linking probably)
352 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
356 // calculate the length of the buffer needed first
357 const size_t nLen
= MB2WC(NULL
, psz
, 0);
358 if ( nLen
!= wxCONV_FAILED
)
360 // now do the actual conversion
361 wxWCharBuffer
buf(nLen
/* +1 added implicitly */);
363 // +1 for the trailing NUL
364 if ( MB2WC(buf
.data(), psz
, nLen
+ 1) != wxCONV_FAILED
)
369 return wxWCharBuffer();
372 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
376 const size_t nLen
= WC2MB(NULL
, pwz
, 0);
377 if ( nLen
!= wxCONV_FAILED
)
379 // extra space for trailing NUL(s)
380 static const size_t extraLen
= GetMaxMBNulLen();
382 wxCharBuffer
buf(nLen
+ extraLen
- 1);
383 if ( WC2MB(buf
.data(), pwz
, nLen
+ extraLen
) != wxCONV_FAILED
)
388 return wxCharBuffer();
392 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
394 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
395 if ( dstLen
!= wxCONV_FAILED
)
397 wxWCharBuffer
wbuf(dstLen
- 1);
398 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
403 if ( wbuf
[dstLen
- 1] == L
'\0' )
414 return wxWCharBuffer();
418 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
420 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
421 if ( dstLen
!= wxCONV_FAILED
)
423 // special case of empty input: can't allocate 0 size buffer below as
424 // wxCharBuffer insists on NUL-terminating it
425 wxCharBuffer
buf(dstLen
? dstLen
- 1 : 1);
426 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
432 const size_t nulLen
= GetMBNulLen();
433 if ( dstLen
>= nulLen
&&
434 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
436 // in this case the output is NUL-terminated and we're not
437 // supposed to count NUL
449 return wxCharBuffer();
452 // ----------------------------------------------------------------------------
454 // ----------------------------------------------------------------------------
456 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
458 return wxMB2WC(buf
, psz
, n
);
461 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
463 return wxWC2MB(buf
, psz
, n
);
466 // ----------------------------------------------------------------------------
467 // wxConvBrokenFileNames
468 // ----------------------------------------------------------------------------
472 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
474 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
475 wxStricmp(charset
, _T("UTF8")) == 0 )
476 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
478 m_conv
= new wxCSConv(charset
);
483 // ----------------------------------------------------------------------------
485 // ----------------------------------------------------------------------------
487 // Implementation (C) 2004 Fredrik Roubert
490 // BASE64 decoding table
492 static const unsigned char utf7unb64
[] =
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
499 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
500 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
501 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
502 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
503 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
504 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
505 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
506 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
507 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
508 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
509 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
528 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
532 while ( *psz
&& (!buf
|| (len
< n
)) )
534 unsigned char cc
= *psz
++;
542 else if (*psz
== '-')
550 else // start of BASE64 encoded string
554 for ( ok
= lsb
= false, d
= 0, l
= 0;
555 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
560 for (l
+= 6; l
>= 8; lsb
= !lsb
)
562 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
572 *buf
= (wchar_t)(c
<< 8);
581 // in valid UTF7 we should have valid characters after '+'
582 return wxCONV_FAILED
;
590 if ( buf
&& (len
< n
) )
597 // BASE64 encoding table
599 static const unsigned char utf7enb64
[] =
601 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
602 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
603 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
604 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
605 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
606 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
607 'w', 'x', 'y', 'z', '0', '1', '2', '3',
608 '4', '5', '6', '7', '8', '9', '+', '/'
612 // UTF-7 encoding table
614 // 0 - Set D (directly encoded characters)
615 // 1 - Set O (optional direct characters)
616 // 2 - whitespace characters (optional)
617 // 3 - special characters
619 static const unsigned char utf7encode
[128] =
621 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
622 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
623 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
624 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
625 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
626 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
627 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
628 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
631 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
635 while (*psz
&& ((!buf
) || (len
< n
)))
638 if (cc
< 0x80 && utf7encode
[cc
] < 1)
647 else if (((wxUint32
)cc
) > 0xffff)
649 // no surrogate pair generation (yet?)
650 return wxCONV_FAILED
;
661 // BASE64 encode string
662 unsigned int lsb
, d
, l
;
663 for (d
= 0, l
= 0; /*nothing*/; psz
++)
665 for (lsb
= 0; lsb
< 2; lsb
++)
668 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
670 for (l
+= 8; l
>= 6; )
674 *buf
++ = utf7enb64
[(d
>> l
) % 64];
680 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
687 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
699 if (buf
&& (len
< n
))
705 // ----------------------------------------------------------------------------
707 // ----------------------------------------------------------------------------
709 static wxUint32 utf8_max
[]=
710 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
712 // boundaries of the private use area we use to (temporarily) remap
713 // characters which are invalid in a UTF-8 encoded string
714 const wxUint32 wxUnicodePUA
= 0x100000;
715 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
717 // this table gives the length of the UTF-8 encoding from its first character:
718 unsigned char tableUtf8Lengths
[256] = {
719 // single-byte sequences (ASCII):
720 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
721 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
722 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
723 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
724 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
725 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
726 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
727 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
729 // these are invalid:
730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
731 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
732 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
733 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
736 // two-byte sequences:
737 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
738 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
740 // three-byte sequences:
741 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
743 // four-byte sequences:
744 4, 4, 4, 4, 4, // F0..F4
746 // these are invalid again (5- or 6-byte
747 // sequences and sequences for code points
748 // above U+10FFFF, as restricted by RFC 3629):
749 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
753 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
754 const char *src
, size_t srcLen
) const
756 wchar_t *out
= dstLen
? dst
: NULL
;
759 if ( srcLen
== wxNO_LEN
)
760 srcLen
= strlen(src
) + 1;
762 for ( const char *p
= src
; ; p
++ )
764 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
766 // all done successfully, just add the trailing NUL if we are not
767 // using explicit length
768 if ( srcLen
== wxNO_LEN
)
784 unsigned char c
= *p
;
785 unsigned len
= tableUtf8Lengths
[c
];
789 if ( srcLen
< len
) // the test works for wxNO_LEN too
792 if ( srcLen
!= wxNO_LEN
)
795 if ( out
&& !dstLen
-- )
799 // Char. number range | UTF-8 octet sequence
800 // (hexadecimal) | (binary)
801 // ----------------------+---------------------------------------------
802 // 0000 0000 - 0000 007F | 0xxxxxxx
803 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
804 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
805 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
807 // Code point value is stored in bits marked with 'x', lowest-order bit
808 // of the value on the right side in the diagram above.
811 // mask to extract lead byte's value ('x' bits above), by sequence length:
812 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
814 // mask and value of lead byte's most significant bits, by length:
815 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
816 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
818 len
--; // it's more convenient to work with 0-based length here
820 // extract the lead byte's value bits:
821 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
824 wxUint32 code
= c
& leadValueMask
[len
];
826 // all remaining bytes, if any, are handled in the same way regardless of
827 // sequence's length:
831 if ( (c
& 0xC0) != 0x80 )
832 return wxCONV_FAILED
;
839 // cast is ok because wchar_t == wxUint16 if WC_UTF16
840 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
849 #endif // WC_UTF16/!WC_UTF16
857 return wxCONV_FAILED
;
861 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
862 const wchar_t *src
, size_t srcLen
) const
864 char *out
= dstLen
? dst
: NULL
;
867 for ( const wchar_t *wp
= src
; ; wp
++ )
869 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
--) )
871 // all done successfully, just add the trailing NUL if we are not
872 // using explicit length
873 if ( srcLen
== wxNO_LEN
)
892 // cast is ok for WC_UTF16
893 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
895 // skip the next char too as we decoded a surrogate
898 #else // wchar_t is UTF-32
899 code
= *wp
& 0x7fffffff;
914 else if ( code
<= 0x07FF )
922 // NB: this line takes 6 least significant bits, encodes them as
923 // 10xxxxxx and discards them so that the next byte can be encoded:
924 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
925 out
[0] = 0xC0 | code
;
928 else if ( code
< 0xFFFF )
936 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
937 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
938 out
[0] = 0xE0 | code
;
941 else if ( code
<= 0x10FFFF )
949 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
950 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
951 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
952 out
[0] = 0xF0 | code
;
957 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
970 // we only get here if an error occurs during decoding
971 return wxCONV_FAILED
;
974 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
976 if ( m_options
== MAP_INVALID_UTF8_NOT
)
977 return wxMBConvStrictUTF8::MB2WC(buf
, psz
, n
);
981 while (*psz
&& ((!buf
) || (len
< n
)))
983 const char *opsz
= psz
;
984 bool invalid
= false;
985 unsigned char cc
= *psz
++, fc
= cc
;
987 for (cnt
= 0; fc
& 0x80; cnt
++)
997 // escape the escape character for octal escapes
998 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
999 && cc
== '\\' && (!buf
|| len
< n
))
1011 // invalid UTF-8 sequence
1016 unsigned ocnt
= cnt
- 1;
1017 wxUint32 res
= cc
& (0x3f >> cnt
);
1021 if ((cc
& 0xC0) != 0x80)
1023 // invalid UTF-8 sequence
1029 res
= (res
<< 6) | (cc
& 0x3f);
1032 if (invalid
|| res
<= utf8_max
[ocnt
])
1034 // illegal UTF-8 encoding
1037 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1038 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1040 // if one of our PUA characters turns up externally
1041 // it must also be treated as an illegal sequence
1042 // (a bit like you have to escape an escape character)
1048 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1049 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1050 if (pa
== wxCONV_FAILED
)
1062 *buf
++ = (wchar_t)res
;
1064 #endif // WC_UTF16/!WC_UTF16
1070 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1072 while (opsz
< psz
&& (!buf
|| len
< n
))
1075 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1076 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1077 wxASSERT(pa
!= wxCONV_FAILED
);
1084 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1090 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1092 while (opsz
< psz
&& (!buf
|| len
< n
))
1094 if ( buf
&& len
+ 3 < n
)
1096 unsigned char on
= *opsz
;
1098 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1099 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1100 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1107 else // MAP_INVALID_UTF8_NOT
1109 return wxCONV_FAILED
;
1115 if (buf
&& (len
< n
))
1121 static inline bool isoctal(wchar_t wch
)
1123 return L
'0' <= wch
&& wch
<= L
'7';
1126 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1128 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1129 return wxMBConvStrictUTF8::WC2MB(buf
, psz
, n
);
1133 while (*psz
&& ((!buf
) || (len
< n
)))
1138 // cast is ok for WC_UTF16
1139 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1140 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1142 cc
= (*psz
++) & 0x7fffffff;
1145 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1146 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1149 *buf
++ = (char)(cc
- wxUnicodePUA
);
1152 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1153 && cc
== L
'\\' && psz
[0] == L
'\\' )
1160 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1162 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1166 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1167 (psz
[1] - L
'0') * 010 +
1177 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1193 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1195 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1201 if (buf
&& (len
< n
))
1207 // ============================================================================
1209 // ============================================================================
1211 #ifdef WORDS_BIGENDIAN
1212 #define wxMBConvUTF16straight wxMBConvUTF16BE
1213 #define wxMBConvUTF16swap wxMBConvUTF16LE
1215 #define wxMBConvUTF16swap wxMBConvUTF16BE
1216 #define wxMBConvUTF16straight wxMBConvUTF16LE
1220 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1222 if ( srcLen
== wxNO_LEN
)
1224 // count the number of bytes in input, including the trailing NULs
1225 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1226 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1229 srcLen
*= BYTES_PER_CHAR
;
1231 else // we already have the length
1233 // we can only convert an entire number of UTF-16 characters
1234 if ( srcLen
% BYTES_PER_CHAR
)
1235 return wxCONV_FAILED
;
1241 // case when in-memory representation is UTF-16 too
1244 // ----------------------------------------------------------------------------
1245 // conversions without endianness change
1246 // ----------------------------------------------------------------------------
1249 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1250 const char *src
, size_t srcLen
) const
1252 // set up the scene for using memcpy() (which is presumably more efficient
1253 // than copying the bytes one by one)
1254 srcLen
= GetLength(src
, srcLen
);
1255 if ( srcLen
== wxNO_LEN
)
1256 return wxCONV_FAILED
;
1258 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1261 if ( dstLen
< inLen
)
1262 return wxCONV_FAILED
;
1264 memcpy(dst
, src
, srcLen
);
1271 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1272 const wchar_t *src
, size_t srcLen
) const
1274 if ( srcLen
== wxNO_LEN
)
1275 srcLen
= wxWcslen(src
) + 1;
1277 srcLen
*= BYTES_PER_CHAR
;
1281 if ( dstLen
< srcLen
)
1282 return wxCONV_FAILED
;
1284 memcpy(dst
, src
, srcLen
);
1290 // ----------------------------------------------------------------------------
1291 // endian-reversing conversions
1292 // ----------------------------------------------------------------------------
1295 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1296 const char *src
, size_t srcLen
) const
1298 srcLen
= GetLength(src
, srcLen
);
1299 if ( srcLen
== wxNO_LEN
)
1300 return wxCONV_FAILED
;
1302 srcLen
/= BYTES_PER_CHAR
;
1306 if ( dstLen
< srcLen
)
1307 return wxCONV_FAILED
;
1309 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1310 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1312 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1320 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1321 const wchar_t *src
, size_t srcLen
) const
1323 if ( srcLen
== wxNO_LEN
)
1324 srcLen
= wxWcslen(src
) + 1;
1326 srcLen
*= BYTES_PER_CHAR
;
1330 if ( dstLen
< srcLen
)
1331 return wxCONV_FAILED
;
1333 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1334 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1336 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1343 #else // !WC_UTF16: wchar_t is UTF-32
1345 // ----------------------------------------------------------------------------
1346 // conversions without endianness change
1347 // ----------------------------------------------------------------------------
1350 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1351 const char *src
, size_t srcLen
) const
1353 srcLen
= GetLength(src
, srcLen
);
1354 if ( srcLen
== wxNO_LEN
)
1355 return wxCONV_FAILED
;
1357 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1360 // optimization: return maximal space which could be needed for this
1361 // string even if the real size could be smaller if the buffer contains
1367 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1368 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1370 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1372 return wxCONV_FAILED
;
1374 if ( ++outLen
> dstLen
)
1375 return wxCONV_FAILED
;
1385 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1386 const wchar_t *src
, size_t srcLen
) const
1388 if ( srcLen
== wxNO_LEN
)
1389 srcLen
= wxWcslen(src
) + 1;
1392 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1393 for ( size_t n
= 0; n
< srcLen
; n
++ )
1396 const size_t numChars
= encode_utf16(*src
++, cc
);
1397 if ( numChars
== wxCONV_FAILED
)
1398 return wxCONV_FAILED
;
1400 outLen
+= numChars
* BYTES_PER_CHAR
;
1403 if ( outLen
> dstLen
)
1404 return wxCONV_FAILED
;
1407 if ( numChars
== 2 )
1409 // second character of a surrogate
1418 // ----------------------------------------------------------------------------
1419 // endian-reversing conversions
1420 // ----------------------------------------------------------------------------
1423 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1424 const char *src
, size_t srcLen
) const
1426 srcLen
= GetLength(src
, srcLen
);
1427 if ( srcLen
== wxNO_LEN
)
1428 return wxCONV_FAILED
;
1430 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1433 // optimization: return maximal space which could be needed for this
1434 // string even if the real size could be smaller if the buffer contains
1440 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1441 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1446 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1448 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1450 const size_t numChars
= decode_utf16(tmp
, ch
);
1451 if ( numChars
== wxCONV_FAILED
)
1452 return wxCONV_FAILED
;
1454 if ( numChars
== 2 )
1457 if ( ++outLen
> dstLen
)
1458 return wxCONV_FAILED
;
1468 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1469 const wchar_t *src
, size_t srcLen
) const
1471 if ( srcLen
== wxNO_LEN
)
1472 srcLen
= wxWcslen(src
) + 1;
1475 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1476 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1479 const size_t numChars
= encode_utf16(*src
, cc
);
1480 if ( numChars
== wxCONV_FAILED
)
1481 return wxCONV_FAILED
;
1483 outLen
+= numChars
* BYTES_PER_CHAR
;
1486 if ( outLen
> dstLen
)
1487 return wxCONV_FAILED
;
1489 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1490 if ( numChars
== 2 )
1492 // second character of a surrogate
1493 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1501 #endif // WC_UTF16/!WC_UTF16
1504 // ============================================================================
1506 // ============================================================================
1508 #ifdef WORDS_BIGENDIAN
1509 #define wxMBConvUTF32straight wxMBConvUTF32BE
1510 #define wxMBConvUTF32swap wxMBConvUTF32LE
1512 #define wxMBConvUTF32swap wxMBConvUTF32BE
1513 #define wxMBConvUTF32straight wxMBConvUTF32LE
1517 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1518 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1521 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1523 if ( srcLen
== wxNO_LEN
)
1525 // count the number of bytes in input, including the trailing NULs
1526 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1527 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1530 srcLen
*= BYTES_PER_CHAR
;
1532 else // we already have the length
1534 // we can only convert an entire number of UTF-32 characters
1535 if ( srcLen
% BYTES_PER_CHAR
)
1536 return wxCONV_FAILED
;
1542 // case when in-memory representation is UTF-16
1545 // ----------------------------------------------------------------------------
1546 // conversions without endianness change
1547 // ----------------------------------------------------------------------------
1550 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1551 const char *src
, size_t srcLen
) const
1553 srcLen
= GetLength(src
, srcLen
);
1554 if ( srcLen
== wxNO_LEN
)
1555 return wxCONV_FAILED
;
1557 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1558 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1560 for ( size_t n
= 0; n
< inLen
; n
++ )
1563 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1564 if ( numChars
== wxCONV_FAILED
)
1565 return wxCONV_FAILED
;
1570 if ( outLen
> dstLen
)
1571 return wxCONV_FAILED
;
1574 if ( numChars
== 2 )
1576 // second character of a surrogate
1586 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1587 const wchar_t *src
, size_t srcLen
) const
1589 if ( srcLen
== wxNO_LEN
)
1590 srcLen
= wxWcslen(src
) + 1;
1594 // optimization: return maximal space which could be needed for this
1595 // string instead of the exact amount which could be less if there are
1596 // any surrogates in the input
1598 // we consider that surrogates are rare enough to make it worthwhile to
1599 // avoid running the loop below at the cost of slightly more memory
1601 return srcLen
* BYTES_PER_CHAR
;
1604 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1606 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1608 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1610 return wxCONV_FAILED
;
1612 outLen
+= BYTES_PER_CHAR
;
1614 if ( outLen
> dstLen
)
1615 return wxCONV_FAILED
;
1623 // ----------------------------------------------------------------------------
1624 // endian-reversing conversions
1625 // ----------------------------------------------------------------------------
1628 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1629 const char *src
, size_t srcLen
) const
1631 srcLen
= GetLength(src
, srcLen
);
1632 if ( srcLen
== wxNO_LEN
)
1633 return wxCONV_FAILED
;
1635 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1636 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1638 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1641 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1642 if ( numChars
== wxCONV_FAILED
)
1643 return wxCONV_FAILED
;
1648 if ( outLen
> dstLen
)
1649 return wxCONV_FAILED
;
1652 if ( numChars
== 2 )
1654 // second character of a surrogate
1664 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1665 const wchar_t *src
, size_t srcLen
) const
1667 if ( srcLen
== wxNO_LEN
)
1668 srcLen
= wxWcslen(src
) + 1;
1672 // optimization: return maximal space which could be needed for this
1673 // string instead of the exact amount which could be less if there are
1674 // any surrogates in the input
1676 // we consider that surrogates are rare enough to make it worthwhile to
1677 // avoid running the loop below at the cost of slightly extra memory
1679 return srcLen
*BYTES_PER_CHAR
;
1682 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1684 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1686 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1688 return wxCONV_FAILED
;
1690 outLen
+= BYTES_PER_CHAR
;
1692 if ( outLen
> dstLen
)
1693 return wxCONV_FAILED
;
1695 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1701 #else // !WC_UTF16: wchar_t is UTF-32
1703 // ----------------------------------------------------------------------------
1704 // conversions without endianness change
1705 // ----------------------------------------------------------------------------
1708 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1709 const char *src
, size_t srcLen
) const
1711 // use memcpy() as it should be much faster than hand-written loop
1712 srcLen
= GetLength(src
, srcLen
);
1713 if ( srcLen
== wxNO_LEN
)
1714 return wxCONV_FAILED
;
1716 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1719 if ( dstLen
< inLen
)
1720 return wxCONV_FAILED
;
1722 memcpy(dst
, src
, srcLen
);
1729 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1730 const wchar_t *src
, size_t srcLen
) const
1732 if ( srcLen
== wxNO_LEN
)
1733 srcLen
= wxWcslen(src
) + 1;
1735 srcLen
*= BYTES_PER_CHAR
;
1739 if ( dstLen
< srcLen
)
1740 return wxCONV_FAILED
;
1742 memcpy(dst
, src
, srcLen
);
1748 // ----------------------------------------------------------------------------
1749 // endian-reversing conversions
1750 // ----------------------------------------------------------------------------
1753 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1754 const char *src
, size_t srcLen
) const
1756 srcLen
= GetLength(src
, srcLen
);
1757 if ( srcLen
== wxNO_LEN
)
1758 return wxCONV_FAILED
;
1760 srcLen
/= BYTES_PER_CHAR
;
1764 if ( dstLen
< srcLen
)
1765 return wxCONV_FAILED
;
1767 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1768 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1770 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1778 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1779 const wchar_t *src
, size_t srcLen
) const
1781 if ( srcLen
== wxNO_LEN
)
1782 srcLen
= wxWcslen(src
) + 1;
1784 srcLen
*= BYTES_PER_CHAR
;
1788 if ( dstLen
< srcLen
)
1789 return wxCONV_FAILED
;
1791 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1792 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1794 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1801 #endif // WC_UTF16/!WC_UTF16
1804 // ============================================================================
1805 // The classes doing conversion using the iconv_xxx() functions
1806 // ============================================================================
1810 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1811 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1812 // (unless there's yet another bug in glibc) the only case when iconv()
1813 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1814 // left in the input buffer -- when _real_ error occurs,
1815 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1817 // [This bug does not appear in glibc 2.2.]
1818 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1819 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1820 (errno != E2BIG || bufLeft != 0))
1822 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1825 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1827 #define ICONV_T_INVALID ((iconv_t)-1)
1829 #if SIZEOF_WCHAR_T == 4
1830 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1831 #define WC_ENC wxFONTENCODING_UTF32
1832 #elif SIZEOF_WCHAR_T == 2
1833 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1834 #define WC_ENC wxFONTENCODING_UTF16
1835 #else // sizeof(wchar_t) != 2 nor 4
1836 // does this ever happen?
1837 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1840 // ----------------------------------------------------------------------------
1841 // wxMBConv_iconv: encapsulates an iconv character set
1842 // ----------------------------------------------------------------------------
1844 class wxMBConv_iconv
: public wxMBConv
1847 wxMBConv_iconv(const char *name
);
1848 virtual ~wxMBConv_iconv();
1850 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1851 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1853 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1854 virtual size_t GetMBNulLen() const;
1856 #if wxUSE_UNICODE_UTF8
1857 virtual bool IsUTF8() const;
1860 virtual wxMBConv
*Clone() const
1862 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
1863 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1868 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1871 // the iconv handlers used to translate from multibyte
1872 // to wide char and in the other direction
1877 // guards access to m2w and w2m objects
1878 wxMutex m_iconvMutex
;
1882 // the name (for iconv_open()) of a wide char charset -- if none is
1883 // available on this machine, it will remain empty
1884 static wxString ms_wcCharsetName
;
1886 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1887 // different endian-ness than the native one
1888 static bool ms_wcNeedsSwap
;
1891 // name of the encoding handled by this conversion
1894 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1896 size_t m_minMBCharWidth
;
1899 // make the constructor available for unit testing
1900 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
1902 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1903 if ( !result
->IsOk() )
1912 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1913 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1915 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
1918 m_minMBCharWidth
= 0;
1920 // check for charset that represents wchar_t:
1921 if ( ms_wcCharsetName
.empty() )
1923 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1926 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1927 #else // !wxUSE_FONTMAP
1928 static const wxChar
*names_static
[] =
1930 #if SIZEOF_WCHAR_T == 4
1932 #elif SIZEOF_WCHAR_T = 2
1937 const wxChar
**names
= names_static
;
1938 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1940 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1942 const wxString
nameCS(*names
);
1944 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1945 wxString
nameXE(nameCS
);
1947 #ifdef WORDS_BIGENDIAN
1949 #else // little endian
1953 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1956 m2w
= iconv_open(nameXE
.ToAscii(), name
);
1957 if ( m2w
== ICONV_T_INVALID
)
1959 // try charset w/o bytesex info (e.g. "UCS4")
1960 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1962 m2w
= iconv_open(nameCS
.ToAscii(), name
);
1964 // and check for bytesex ourselves:
1965 if ( m2w
!= ICONV_T_INVALID
)
1967 char buf
[2], *bufPtr
;
1968 wchar_t wbuf
[2], *wbufPtr
;
1976 outsz
= SIZEOF_WCHAR_T
* 2;
1981 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1982 (char**)&wbufPtr
, &outsz
);
1984 if (ICONV_FAILED(res
, insz
))
1986 wxLogLastError(wxT("iconv"));
1987 wxLogError(_("Conversion to charset '%s' doesn't work."),
1990 else // ok, can convert to this encoding, remember it
1992 ms_wcCharsetName
= nameCS
;
1993 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1997 else // use charset not requiring byte swapping
1999 ms_wcCharsetName
= nameXE
;
2003 wxLogTrace(TRACE_STRCONV
,
2004 wxT("iconv wchar_t charset is \"%s\"%s"),
2005 ms_wcCharsetName
.empty() ? wxString("<none>")
2007 ms_wcNeedsSwap
? _T(" (needs swap)")
2010 else // we already have ms_wcCharsetName
2012 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2015 if ( ms_wcCharsetName
.empty() )
2017 w2m
= ICONV_T_INVALID
;
2021 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2022 if ( w2m
== ICONV_T_INVALID
)
2024 wxLogTrace(TRACE_STRCONV
,
2025 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2026 ms_wcCharsetName
.c_str(), name
);
2031 wxMBConv_iconv::~wxMBConv_iconv()
2033 if ( m2w
!= ICONV_T_INVALID
)
2035 if ( w2m
!= ICONV_T_INVALID
)
2039 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2041 // find the string length: notice that must be done differently for
2042 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2044 const size_t nulLen
= GetMBNulLen();
2048 return wxCONV_FAILED
;
2051 inbuf
= strlen(psz
); // arguably more optimized than our version
2056 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2057 // they also have to start at character boundary and not span two
2058 // adjacent characters
2060 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2067 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2068 // Unfortunately there are a couple of global wxCSConv objects such as
2069 // wxConvLocal that are used all over wx code, so we have to make sure
2070 // the handle is used by at most one thread at the time. Otherwise
2071 // only a few wx classes would be safe to use from non-main threads
2072 // as MB<->WC conversion would fail "randomly".
2073 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2074 #endif // wxUSE_THREADS
2076 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
2078 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2079 wchar_t *bufPtr
= buf
;
2080 const char *pszPtr
= psz
;
2084 // have destination buffer, convert there
2086 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
2087 (char**)&bufPtr
, &outbuf
);
2088 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
2092 // convert to native endianness
2093 for ( unsigned i
= 0; i
< res
; i
++ )
2094 buf
[n
] = WC_BSWAP(buf
[i
]);
2097 // NUL-terminate the string if there is any space left
2103 // no destination buffer... convert using temp buffer
2104 // to calculate destination buffer requirement
2111 outbuf
= 8 * SIZEOF_WCHAR_T
;
2114 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
2115 (char**)&bufPtr
, &outbuf
);
2117 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
2119 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2122 if (ICONV_FAILED(cres
, inbuf
))
2124 //VS: it is ok if iconv fails, hence trace only
2125 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2126 return wxCONV_FAILED
;
2132 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2135 // NB: explained in MB2WC
2136 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2139 size_t inlen
= wxWcslen(psz
);
2140 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
2144 wchar_t *tmpbuf
= 0;
2148 // need to copy to temp buffer to switch endianness
2149 // (doing WC_BSWAP twice on the original buffer won't help, as it
2150 // could be in read-only memory, or be accessed in some other thread)
2151 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
2152 for ( size_t i
= 0; i
< inlen
; i
++ )
2153 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
2155 tmpbuf
[inlen
] = L
'\0';
2161 // have destination buffer, convert there
2162 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
2166 // NB: iconv was given only wcslen(psz) characters on input, and so
2167 // it couldn't convert the trailing zero. Let's do it ourselves
2168 // if there's some room left for it in the output buffer.
2174 // no destination buffer: convert using temp buffer
2175 // to calculate destination buffer requirement
2183 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
2187 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2195 if (ICONV_FAILED(cres
, inbuf
))
2197 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2198 return wxCONV_FAILED
;
2204 size_t wxMBConv_iconv::GetMBNulLen() const
2206 if ( m_minMBCharWidth
== 0 )
2208 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2211 // NB: explained in MB2WC
2212 wxMutexLocker
lock(self
->m_iconvMutex
);
2215 const wchar_t *wnul
= L
"";
2216 char buf
[8]; // should be enough for NUL in any encoding
2217 size_t inLen
= sizeof(wchar_t),
2218 outLen
= WXSIZEOF(buf
);
2219 char *inBuff
= (char *)wnul
;
2220 char *outBuff
= buf
;
2221 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2223 self
->m_minMBCharWidth
= (size_t)-1;
2227 self
->m_minMBCharWidth
= outBuff
- buf
;
2231 return m_minMBCharWidth
;
#if wxUSE_UNICODE_UTF8
// Return true if the name of the charset handled by this object is "UTF-8"
// or "UTF8", compared using wxStricmp().
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
2242 #endif // HAVE_ICONV
2245 // ============================================================================
2246 // Win32 conversion classes
2247 // ============================================================================
2249 #ifdef wxHAVE_WIN32_MB2WC
2253 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2254 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2257 class wxMBConv_win32
: public wxMBConv
2262 m_CodePage
= CP_ACP
;
2263 m_minMBCharWidth
= 0;
2266 wxMBConv_win32(const wxMBConv_win32
& conv
)
2269 m_CodePage
= conv
.m_CodePage
;
2270 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2274 wxMBConv_win32(const char* name
)
2276 m_CodePage
= wxCharsetToCodepage(name
);
2277 m_minMBCharWidth
= 0;
2280 wxMBConv_win32(wxFontEncoding encoding
)
2282 m_CodePage
= wxEncodingToCodepage(encoding
);
2283 m_minMBCharWidth
= 0;
2285 #endif // wxUSE_FONTMAP
2287 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2289 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2290 // the behaviour is not compatible with the Unix version (using iconv)
2291 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2292 // wouldn't work if reading an incomplete MB char didn't result in an
2295 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2296 // Win XP or newer and it is not supported for UTF-[78] so we always
2297 // use our own conversions in this case. See
2298 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2299 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2300 if ( m_CodePage
== CP_UTF8
)
2302 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2305 if ( m_CodePage
== CP_UTF7
)
2307 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2311 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2312 IsAtLeastWin2kSP4() )
2314 flags
= MB_ERR_INVALID_CHARS
;
2317 const size_t len
= ::MultiByteToWideChar
2319 m_CodePage
, // code page
2320 flags
, // flags: fall on error
2321 psz
, // input string
2322 -1, // its length (NUL-terminated)
2323 buf
, // output string
2324 buf
? n
: 0 // size of output buffer
2328 // function totally failed
2329 return wxCONV_FAILED
;
2332 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2333 // check if we succeeded, by doing a double trip:
2334 if ( !flags
&& buf
)
2336 const size_t mbLen
= strlen(psz
);
2337 wxCharBuffer
mbBuf(mbLen
);
2338 if ( ::WideCharToMultiByte
2345 mbLen
+ 1, // size in bytes, not length
2349 strcmp(mbBuf
, psz
) != 0 )
2351 // we didn't obtain the same thing we started from, hence
2352 // the conversion was lossy and we consider that it failed
2353 return wxCONV_FAILED
;
2357 // note that it returns count of written chars for buf != NULL and size
2358 // of the needed buffer for buf == NULL so in either case the length of
2359 // the string (which never includes the terminating NUL) is one less
2363 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2366 we have a problem here: by default, WideCharToMultiByte() may
2367 replace characters unrepresentable in the target code page with bad
2368 quality approximations such as turning "1/2" symbol (U+00BD) into
2369 "1" for the code pages which don't have it and we, obviously, want
2370 to avoid this at any price
2372 the trouble is that this function does it _silently_, i.e. it won't
2373 even tell us whether it did or not... Win98/2000 and higher provide
2374 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2375 we have to resort to a round trip, i.e. check that converting back
2376 results in the same string -- this is, of course, expensive but
2377 otherwise we simply can't be sure to not garble the data.
2380 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2381 // it doesn't work with CJK encodings (which we test for rather roughly
2382 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2384 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2387 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2389 // it's our lucky day
2390 flags
= WC_NO_BEST_FIT_CHARS
;
2391 pUsedDef
= &usedDef
;
2393 else // old system or unsupported encoding
2399 const size_t len
= ::WideCharToMultiByte
2401 m_CodePage
, // code page
2402 flags
, // either none or no best fit
2403 pwz
, // input string
2404 -1, // it is (wide) NUL-terminated
2405 buf
, // output buffer
2406 buf
? n
: 0, // and its size
2407 NULL
, // default "replacement" char
2408 pUsedDef
// [out] was it used?
2413 // function totally failed
2414 return wxCONV_FAILED
;
2417 // if we were really converting, check if we succeeded
2422 // check if the conversion failed, i.e. if any replacements
2425 return wxCONV_FAILED
;
2427 else // we must resort to double tripping...
2429 wxWCharBuffer
wcBuf(n
);
2430 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2431 wcscmp(wcBuf
, pwz
) != 0 )
2433 // we didn't obtain the same thing we started from, hence
2434 // the conversion was lossy and we consider that it failed
2435 return wxCONV_FAILED
;
2440 // see the comment above for the reason of "len - 1"
2444 virtual size_t GetMBNulLen() const
2446 if ( m_minMBCharWidth
== 0 )
2448 int len
= ::WideCharToMultiByte
2450 m_CodePage
, // code page
2452 L
"", // input string
2453 1, // translate just the NUL
2454 NULL
, // output buffer
2456 NULL
, // no replacement char
2457 NULL
// [out] don't care if it was used
2460 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2464 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2465 self
->m_minMBCharWidth
= (size_t)-1;
2469 self
->m_minMBCharWidth
= (size_t)-1;
2475 self
->m_minMBCharWidth
= len
;
2480 return m_minMBCharWidth
;
2483 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2485 bool IsOk() const { return m_CodePage
!= -1; }
2488 static bool CanUseNoBestFit()
2490 static int s_isWin98Or2k
= -1;
2492 if ( s_isWin98Or2k
== -1 )
2495 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2497 case wxOS_WINDOWS_9X
:
2498 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2501 case wxOS_WINDOWS_NT
:
2502 s_isWin98Or2k
= verMaj
>= 5;
2506 // unknown: be conservative by default
2511 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2514 return s_isWin98Or2k
== 1;
2517 static bool IsAtLeastWin2kSP4()
2522 static int s_isAtLeastWin2kSP4
= -1;
2524 if ( s_isAtLeastWin2kSP4
== -1 )
2526 OSVERSIONINFOEX ver
;
2528 memset(&ver
, 0, sizeof(ver
));
2529 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2530 GetVersionEx((OSVERSIONINFO
*)&ver
);
2532 s_isAtLeastWin2kSP4
=
2533 ((ver
.dwMajorVersion
> 5) || // Vista+
2534 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2535 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2536 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2540 return s_isAtLeastWin2kSP4
== 1;
2545 // the code page we're working with
2548 // cached result of GetMBNulLen(), set to 0 initially meaning
2550 size_t m_minMBCharWidth
;
2553 #endif // wxHAVE_WIN32_MB2WC
2556 // ============================================================================
2557 // wxEncodingConverter based conversion classes
2558 // ============================================================================
2562 class wxMBConv_wxwin
: public wxMBConv
2567 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2568 // The wxMBConv_cf class does a better job.
2569 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2570 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2571 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2575 // temporarily just use wxEncodingConverter stuff,
2576 // so that it works while a better implementation is built
2577 wxMBConv_wxwin(const char* name
)
2580 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2582 m_enc
= wxFONTENCODING_SYSTEM
;
2587 wxMBConv_wxwin(wxFontEncoding enc
)
2594 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2596 size_t inbuf
= strlen(psz
);
2599 if (!m2w
.Convert(psz
, buf
))
2600 return wxCONV_FAILED
;
2605 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2607 const size_t inbuf
= wxWcslen(psz
);
2610 if (!w2m
.Convert(psz
, buf
))
2611 return wxCONV_FAILED
;
2617 virtual size_t GetMBNulLen() const
2621 case wxFONTENCODING_UTF16BE
:
2622 case wxFONTENCODING_UTF16LE
:
2625 case wxFONTENCODING_UTF32BE
:
2626 case wxFONTENCODING_UTF32LE
:
2634 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2636 bool IsOk() const { return m_ok
; }
2639 wxFontEncoding m_enc
;
2640 wxEncodingConverter m2w
, w2m
;
2643 // were we initialized successfully?
2646 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2649 // make the constructors available for unit testing
2650 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2652 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2653 if ( !result
->IsOk() )
2662 #endif // wxUSE_FONTMAP
2664 // ============================================================================
2665 // wxCSConv implementation
2666 // ============================================================================
2668 void wxCSConv::Init()
2675 wxCSConv::wxCSConv(const wxString
& charset
)
2679 if ( !charset
.empty() )
2681 SetName(charset
.ToAscii());
2685 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2687 m_encoding
= wxFONTENCODING_SYSTEM
;
2691 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2693 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2695 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2697 encoding
= wxFONTENCODING_SYSTEM
;
2702 m_encoding
= encoding
;
2705 wxCSConv::~wxCSConv()
2710 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2715 SetName(conv
.m_name
);
2716 m_encoding
= conv
.m_encoding
;
2719 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2723 SetName(conv
.m_name
);
2724 m_encoding
= conv
.m_encoding
;
2729 void wxCSConv::Clear()
2738 void wxCSConv::SetName(const char *charset
)
2742 m_name
= strdup(charset
);
2749 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2750 wxEncodingNameCache
);
2752 static wxEncodingNameCache gs_nameCache
;
2755 wxMBConv
*wxCSConv::DoCreate() const
2758 wxLogTrace(TRACE_STRCONV
,
2759 wxT("creating conversion for %s"),
2761 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2762 #endif // wxUSE_FONTMAP
2764 // check for the special case of ASCII or ISO8859-1 charset: as we have
2765 // special knowledge of it anyhow, we don't need to create a special
2766 // conversion object
2767 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2768 m_encoding
== wxFONTENCODING_DEFAULT
)
2770 // don't convert at all
2774 // we trust OS to do conversion better than we can so try external
2775 // conversion methods first
2777 // the full order is:
2778 // 1. OS conversion (iconv() under Unix or Win32 API)
2779 // 2. hard coded conversions for UTF
2780 // 3. wxEncodingConverter as fall back
2786 #endif // !wxUSE_FONTMAP
2789 wxFontEncoding
encoding(m_encoding
);
2794 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2802 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2803 #endif // wxUSE_FONTMAP
2807 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2808 if ( it
!= gs_nameCache
.end() )
2810 if ( it
->second
.empty() )
2813 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
2820 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2821 // CS : in case this does not return valid names (eg for MacRoman)
2822 // encoding got a 'failure' entry in the cache all the same,
2823 // although it just has to be created using a different method, so
2824 // only store failed iconv creation attempts (or perhaps we
2825 // shouldn't do this at all?)
2826 if ( names
[0] != NULL
)
2828 for ( ; *names
; ++names
)
2830 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2831 // will need changes that will obsolete this
2832 wxString
name(*names
);
2833 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
2836 gs_nameCache
[encoding
] = *names
;
2843 gs_nameCache
[encoding
] = _T(""); // cache the failure
2846 #endif // wxUSE_FONTMAP
2848 #endif // HAVE_ICONV
2850 #ifdef wxHAVE_WIN32_MB2WC
2853 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2854 : new wxMBConv_win32(m_encoding
);
2863 #endif // wxHAVE_WIN32_MB2WC
2867 // leave UTF16 and UTF32 to the built-ins of wx
2868 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2869 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2872 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
2873 : new wxMBConv_cf(m_encoding
);
2875 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
2884 #endif // __DARWIN__
2887 wxFontEncoding enc
= m_encoding
;
2889 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2891 // use "false" to suppress interactive dialogs -- we can be called from
2892 // anywhere and popping up a dialog from here is the last thing we want to
2894 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2896 #endif // wxUSE_FONTMAP
2900 case wxFONTENCODING_UTF7
:
2901 return new wxMBConvUTF7
;
2903 case wxFONTENCODING_UTF8
:
2904 return new wxMBConvUTF8
;
2906 case wxFONTENCODING_UTF16BE
:
2907 return new wxMBConvUTF16BE
;
2909 case wxFONTENCODING_UTF16LE
:
2910 return new wxMBConvUTF16LE
;
2912 case wxFONTENCODING_UTF32BE
:
2913 return new wxMBConvUTF32BE
;
2915 case wxFONTENCODING_UTF32LE
:
2916 return new wxMBConvUTF32LE
;
2919 // nothing to do but put here to suppress gcc warnings
2926 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2927 : new wxMBConv_wxwin(m_encoding
);
2933 #endif // wxUSE_FONTMAP
2935 // NB: This is a hack to prevent deadlock. What could otherwise happen
2936 // in Unicode build: wxConvLocal creation ends up being here
2937 // because of some failure and logs the error. But wxLog will try to
2938 // attach a timestamp, for which it will need wxConvLocal (to convert
2939 // time to char* and then wchar_t*), but that fails, tries to log the
2940 // error, but wxLog has an (already locked) critical section that
2941 // guards the static buffer.
2942 static bool alreadyLoggingError
= false;
2943 if (!alreadyLoggingError
)
2945 alreadyLoggingError
= true;
2946 wxLogError(_("Cannot convert from the charset '%s'!"),
2950 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
2951 #else // !wxUSE_FONTMAP
2952 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
2953 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2956 alreadyLoggingError
= false;
2962 void wxCSConv::CreateConvIfNeeded() const
2966 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
2968 // if we have neither the name nor the encoding, use the default
2969 // encoding for this system
2970 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
2973 self
->m_encoding
= wxLocale::GetSystemEncoding();
2975 // fallback to some reasonable default:
2976 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
2977 #endif // wxUSE_INTL
2980 self
->m_convReal
= DoCreate();
2981 self
->m_deferred
= false;
2985 bool wxCSConv::IsOk() const
2987 CreateConvIfNeeded();
2989 // special case: no convReal created for wxFONTENCODING_ISO8859_1
2990 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
2991 return true; // always ok as we do it ourselves
2993 // m_convReal->IsOk() is called at its own creation, so we know it must
2994 // be ok if m_convReal is non-NULL
2995 return m_convReal
!= NULL
;
2998 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
2999 const char *src
, size_t srcLen
) const
3001 CreateConvIfNeeded();
3004 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3007 return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
);
3010 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3011 const wchar_t *src
, size_t srcLen
) const
3013 CreateConvIfNeeded();
3016 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3019 return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
);
3022 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3024 CreateConvIfNeeded();
3027 return m_convReal
->MB2WC(buf
, psz
, n
);
3030 size_t len
= strlen(psz
);
3034 for (size_t c
= 0; c
<= len
; c
++)
3035 buf
[c
] = (unsigned char)(psz
[c
]);
3041 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3043 CreateConvIfNeeded();
3046 return m_convReal
->WC2MB(buf
, psz
, n
);
3049 const size_t len
= wxWcslen(psz
);
3052 for (size_t c
= 0; c
<= len
; c
++)
3055 return wxCONV_FAILED
;
3057 buf
[c
] = (char)psz
[c
];
3062 for (size_t c
= 0; c
<= len
; c
++)
3065 return wxCONV_FAILED
;
3072 size_t wxCSConv::GetMBNulLen() const
3074 CreateConvIfNeeded();
3078 return m_convReal
->GetMBNulLen();
3081 // otherwise, we are ISO-8859-1
#if wxUSE_UNICODE_UTF8
// Return true if the real conversion object is an UTF-8 one; without a real
// conversion object we are ISO-8859-1, i.e. not UTF-8.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    if ( !m_convReal )
    {
        // otherwise, we are ISO-8859-1
        return false;
    }

    return m_convReal->IsUTF8();
}
#endif
3103 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3106 return wxWCharBuffer();
3108 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3110 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3112 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
3117 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3120 return wxCharBuffer();
3122 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3124 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3129 #endif // wxUSE_UNICODE
3131 // ----------------------------------------------------------------------------
3133 // ----------------------------------------------------------------------------
3135 // NB: The reason why we create converted objects in this convoluted way,
3136 // using a factory function instead of global variable, is that they
3137 // may be used at static initialization time (some of them are used by
3138 // wxString ctors and there may be a global wxString object). In other
3139 // words, possibly _before_ the converter global object would be
3146 #undef wxConvISO8859_1
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)
//
// Expands to three definitions for the converter called `name`:
//   - an exported pointer variable name##Ptr of type klass*, initialised to
//     NULL (nothing in this macro assigns it afterwards),
//   - an exported accessor wxGet_##name##Ptr() that owns a function-local
//     static object of type impl_klass and returns its address as klass*,
//   - a file-local pointer gs_##name##instance initialised by calling the
//     accessor, which forces the object to be constructed during static
//     initialisation (see the comment inside the macro).
//
// Parameters:
//   klass       pointer type exposed to callers
//   impl_klass  concrete type of the static object
//   name        base identifier pasted into all generated names
//   ctor_args   tokens pasted after the object name: parenthesised
//               constructor arguments, or an empty placeholder
//
// The expansion ends without a semicolon; the use site supplies it.
3148 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3149 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3150 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3152 static impl_klass name##Obj ctor_args; \
3153 return &name##Obj; \
3155 /* this ensures that all global converter objects are created */ \
3156 /* by the time static initialization is done, i.e. before any */ \
3157 /* thread is launched: */ \
3158 static klass* gs_##name##instance = wxGet_##name##Ptr()
// WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args): shorthand for
// WX_DEFINE_GLOBAL_CONV2 when the exposed pointer type and the concrete
// object type are the same class (klass is passed for both).
3160 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3161 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// Definitions of the global converter objects.
//
// wxConvLibc is defined twice below with different implementation classes
// (wxMBConv_win32 and wxMBConvLibc), both exposed as wxMBConv*.
// NOTE(review): the preprocessor lines choosing between the two definitions
// are not visible in this excerpt -- presumably keyed on Win32 support;
// confirm against the complete file.
3164 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3166 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
// UTF-8 and UTF-7 converters, default-constructed.
3169 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, wxEMPTY_PARAMETER_VALUE
);
3170 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, wxEMPTY_PARAMETER_VALUE
);
// wxCSConv-based converters, constructed for wxFONTENCODING_SYSTEM and
// wxFONTENCODING_ISO8859_1 respectively.
3172 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3173 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// wxConvCurrent and wxConvUI are plain pointers initialised from the
// accessors, so they start out pointing at the wxConvLibc and wxConvLocal
// objects respectively.
3175 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3176 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
// Darwin-specific converter object plus the definition of the global
// wxConvFileName pointer.
3179 // The xnu kernel always communicates file paths in decomposed UTF-8.
3180 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3181 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
// On non-Darwin builds wxConvFileName is initialised from
// wxGet_wxConvLibcPtr(), i.e. it points at the wxConvLibc object.
// NOTE(review): the opening #ifdef and the Darwin initialiser are not visible
// in this excerpt -- presumably the address of wxConvMacUTF8DObj; confirm.
3184 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3187 #else // !__DARWIN__
3188 wxGet_wxConvLibcPtr();
3189 #endif // __DARWIN__/!__DARWIN__
3191 #else // !wxUSE_WCHAR_T
3193 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3194 // stand-ins in absence of wchar_t
3195 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3200 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T