1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
53 #include "wx/thread.h"
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
64 #define TRACE_STRCONV _T("strconv")
66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
68 #if SIZEOF_WCHAR_T == 2
73 // ============================================================================
75 // ============================================================================
77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
78 static bool NotAllNULs(const char *p
, size_t n
)
80 while ( n
&& *p
++ == '\0' )
86 // ----------------------------------------------------------------------------
87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
88 // ----------------------------------------------------------------------------
90 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
95 *output
= (wxUint16
) input
;
99 else if (input
>= 0x110000)
101 return wxCONV_FAILED
;
107 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
108 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
115 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
117 if ((*input
< 0xd800) || (*input
> 0xdfff))
122 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
125 return wxCONV_FAILED
;
129 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
135 typedef wchar_t wxDecodeSurrogate_t
;
137 typedef wxUint16 wxDecodeSurrogate_t
;
138 #endif // WC_UTF16/!WC_UTF16
140 // returns the next UTF-32 character from the wchar_t buffer and advances the
141 // pointer to the character after this one
143 // if an invalid character is found, *pSrc is set to NULL, the caller must
145 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
149 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
150 if ( n
== wxCONV_FAILED
)
158 // ----------------------------------------------------------------------------
160 // ----------------------------------------------------------------------------
163 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
164 const char *src
, size_t srcLen
) const
166 // although new conversion classes are supposed to implement this function
167 // directly, the existing ones only implement the old MB2WC() and so, to
168 // avoid having to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten
= 0;
176 // the number of NULs terminating this string
177 size_t nulLen
= 0; // not really needed, but just to avoid warnings
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
185 if ( srcLen
!= wxNO_LEN
)
187 // we need to know how to find the end of this string
188 nulLen
= GetMBNulLen();
189 if ( nulLen
== wxCONV_FAILED
)
190 return wxCONV_FAILED
;
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
197 char * const p
= bufTmp
.data();
198 memcpy(p
, src
, srcLen
);
199 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
205 srcEnd
= src
+ srcLen
;
207 else // quit after the first loop iteration
214 // try to convert the current chunk
215 size_t lenChunk
= MB2WC(NULL
, src
, 0);
216 if ( lenChunk
== wxCONV_FAILED
)
217 return wxCONV_FAILED
;
219 lenChunk
++; // for the L'\0' at the end of this chunk
221 dstWritten
+= lenChunk
;
225 // nothing left in the input string, conversion succeeded
231 if ( dstWritten
> dstLen
)
232 return wxCONV_FAILED
;
234 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
235 return wxCONV_FAILED
;
242 // we convert just one chunk in this case as this is the entire
247 // advance the input pointer past the end of this chunk
248 while ( NotAllNULs(src
, nulLen
) )
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
257 src
+= nulLen
; // skipping over its terminator as well
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by srcEnd
270 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
271 const wchar_t *src
, size_t srcLen
) const
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten
= 0;
276 // make a copy of the input string unless it is already properly
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp
;
282 if ( srcLen
== wxNO_LEN
)
284 srcLen
= wxWcslen(src
) + 1;
286 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
288 // make a copy in order to properly NUL-terminate the string
289 bufTmp
= wxWCharBuffer(srcLen
);
290 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
294 const size_t lenNul
= GetMBNulLen();
295 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
297 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
299 // try to convert the current chunk
300 size_t lenChunk
= WC2MB(NULL
, src
, 0);
302 if ( lenChunk
== wxCONV_FAILED
)
303 return wxCONV_FAILED
;
306 dstWritten
+= lenChunk
;
310 if ( dstWritten
> dstLen
)
311 return wxCONV_FAILED
;
313 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
314 return wxCONV_FAILED
;
323 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
325 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
326 if ( rc
!= wxCONV_FAILED
)
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
336 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
338 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
339 if ( rc
!= wxCONV_FAILED
)
347 wxMBConv::~wxMBConv()
349 // nothing to do here (necessary for Darwin linking probably)
352 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
356 // calculate the length of the buffer needed first
357 const size_t nLen
= ToWChar(NULL
, 0, psz
);
358 if ( nLen
!= wxCONV_FAILED
)
360 // now do the actual conversion
361 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
363 // +1 for the trailing NULL
364 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
369 return wxWCharBuffer();
372 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
376 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
377 if ( nLen
!= wxCONV_FAILED
)
379 wxCharBuffer
buf(nLen
- 1);
380 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
385 return wxCharBuffer();
389 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
391 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
392 if ( dstLen
!= wxCONV_FAILED
)
394 wxWCharBuffer
wbuf(dstLen
- 1);
395 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
400 if ( wbuf
[dstLen
- 1] == L
'\0' )
411 return wxWCharBuffer();
415 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
417 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
418 if ( dstLen
!= wxCONV_FAILED
)
420 // special case of empty input: can't allocate 0 size buffer below as
421 // wxCharBuffer insists on NUL-terminating it
422 wxCharBuffer
buf(dstLen
? dstLen
- 1 : 1);
423 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
429 const size_t nulLen
= GetMBNulLen();
430 if ( dstLen
>= nulLen
&&
431 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
433 // in this case the output is NUL-terminated and we're not
434 // supposed to count NUL
446 return wxCharBuffer();
449 // ----------------------------------------------------------------------------
451 // ----------------------------------------------------------------------------
453 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
455 return wxMB2WC(buf
, psz
, n
);
458 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
460 return wxWC2MB(buf
, psz
, n
);
463 // ----------------------------------------------------------------------------
464 // wxConvBrokenFileNames
465 // ----------------------------------------------------------------------------
469 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
471 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
472 wxStricmp(charset
, _T("UTF8")) == 0 )
473 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
475 m_conv
= new wxCSConv(charset
);
480 // ----------------------------------------------------------------------------
482 // ----------------------------------------------------------------------------
484 // Implementation (C) 2004 Fredrik Roubert
487 // BASE64 decoding table
489 static const unsigned char utf7unb64
[] =
491 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
492 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
497 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
498 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
499 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
500 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
501 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
502 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
504 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
505 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
506 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
508 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
525 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
529 while ( *psz
&& (!buf
|| (len
< n
)) )
531 unsigned char cc
= *psz
++;
539 else if (*psz
== '-')
547 else // start of BASE64 encoded string
551 for ( ok
= lsb
= false, d
= 0, l
= 0;
552 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
557 for (l
+= 6; l
>= 8; lsb
= !lsb
)
559 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
569 *buf
= (wchar_t)(c
<< 8);
578 // in valid UTF7 we should have valid characters after '+'
579 return wxCONV_FAILED
;
587 if ( buf
&& (len
< n
) )
594 // BASE64 encoding table
596 static const unsigned char utf7enb64
[] =
598 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
599 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
600 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
601 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
602 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
603 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
604 'w', 'x', 'y', 'z', '0', '1', '2', '3',
605 '4', '5', '6', '7', '8', '9', '+', '/'
609 // UTF-7 encoding table
611 // 0 - Set D (directly encoded characters)
612 // 1 - Set O (optional direct characters)
613 // 2 - whitespace characters (optional)
614 // 3 - special characters
616 static const unsigned char utf7encode
[128] =
618 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
619 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
620 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
621 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
622 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
624 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
625 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
628 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
632 while (*psz
&& ((!buf
) || (len
< n
)))
635 if (cc
< 0x80 && utf7encode
[cc
] < 1)
644 else if (((wxUint32
)cc
) > 0xffff)
646 // no surrogate pair generation (yet?)
647 return wxCONV_FAILED
;
658 // BASE64 encode string
659 unsigned int lsb
, d
, l
;
660 for (d
= 0, l
= 0; /*nothing*/; psz
++)
662 for (lsb
= 0; lsb
< 2; lsb
++)
665 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
667 for (l
+= 8; l
>= 6; )
671 *buf
++ = utf7enb64
[(d
>> l
) % 64];
677 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
684 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
696 if (buf
&& (len
< n
))
702 // ----------------------------------------------------------------------------
704 // ----------------------------------------------------------------------------
// utf8_max[i] is the upper bound used for UTF-8 sequences of (i + 1) bytes:
// 0x7f, 0x7ff and 0xffff match the 1-, 2- and 3-byte ranges of the RFC 3629
// table quoted in wxMBConvStrictUTF8::ToWChar(); the following entries
// presumably continue the pattern for the longer sequences of the original
// UTF-8 scheme.
//
// wxMBConvUTF8::MB2WC() treats a decoded value which is not above
// utf8_max[ocnt] as an illegal encoding, and wxMBConvUTF8::WC2MB() walks the
// table while the character is greater than the current entry to find how
// many bytes it needs. The final 0xffffffff entry can never be exceeded by a
// wxUint32, so that walk always stops inside the table.
706 static const wxUint32 utf8_max
[]=
707 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
709 // boundaries of the private use area we use to (temporarily) remap
710 // characters invalid in a UTF-8 encoded string
//
// The range is half-open: wxUnicodePUA is its first code point and
// wxUnicodePUAEnd is one past its last, giving 256 code points. With
// MAP_INVALID_UTF8_TO_PUA an invalid input byte b is mapped to
// wxUnicodePUA + b, and a code point is recognised as remapped when
// wxUnicodePUA <= c < wxUnicodePUAEnd.
711 const wxUint32 wxUnicodePUA
= 0x100000;
712 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
714 // this table gives the length of the UTF-8 encoding from its first character:
715 const unsigned char tableUtf8Lengths
[256] = {
716 // single-byte sequences (ASCII):
717 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
718 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
719 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
720 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
721 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
722 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
723 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
724 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
726 // these are invalid:
727 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
728 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
733 // two-byte sequences:
734 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
735 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
737 // three-byte sequences:
738 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
740 // four-byte sequences:
741 4, 4, 4, 4, 4, // F0..F4
743 // these are invalid again (5- or 6-byte
744 // sequences and sequences for code points
745 // above U+10FFFF, as restricted by RFC 3629):
746 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
750 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
751 const char *src
, size_t srcLen
) const
753 wchar_t *out
= dstLen
? dst
: NULL
;
756 if ( srcLen
== wxNO_LEN
)
757 srcLen
= strlen(src
) + 1;
759 for ( const char *p
= src
; ; p
++ )
761 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
763 // all done successfully, just add the trailing NULL if we are not
764 // using explicit length
765 if ( srcLen
== wxNO_LEN
)
781 if ( out
&& !dstLen
-- )
785 unsigned char c
= *p
;
789 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
792 if ( srcLen
!= wxNO_LEN
)
799 unsigned len
= tableUtf8Lengths
[c
];
803 if ( srcLen
< len
) // the test works for wxNO_LEN too
806 if ( srcLen
!= wxNO_LEN
)
809 // Char. number range | UTF-8 octet sequence
810 // (hexadecimal) | (binary)
811 // ----------------------+----------------------------------------
812 // 0000 0000 - 0000 007F | 0xxxxxxx
813 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
814 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
815 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
817 // Code point value is stored in bits marked with 'x',
818 // lowest-order bit of the value on the right side in the diagram
819 // above. (from RFC 3629)
821 // mask to extract lead byte's value ('x' bits above), by sequence
823 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
825 // mask and value of lead byte's most significant bits, by length:
826 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
827 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
829 len
--; // it's more convenient to work with 0-based length here
831 // extract the lead byte's value bits:
832 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
835 code
= c
& leadValueMask
[len
];
837 // all remaining bytes, if any, are handled in the same way
838 // regardless of sequence's length:
842 if ( (c
& 0xC0) != 0x80 )
843 return wxCONV_FAILED
;
851 // cast is ok because wchar_t == wxUint16 if WC_UTF16
852 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
861 #endif // WC_UTF16/!WC_UTF16
869 return wxCONV_FAILED
;
873 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
874 const wchar_t *src
, size_t srcLen
) const
876 char *out
= dstLen
? dst
: NULL
;
879 for ( const wchar_t *wp
= src
; ; wp
++ )
881 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
--) )
883 // all done successfully, just add the trailing NULL if we are not
884 // using explicit length
885 if ( srcLen
== wxNO_LEN
)
904 // cast is ok for WC_UTF16
905 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
907 // skip the next char too as we decoded a surrogate
910 #else // wchar_t is UTF-32
911 code
= *wp
& 0x7fffffff;
926 else if ( code
<= 0x07FF )
934 // NB: this line takes 6 least significant bits, encodes them as
935 // 10xxxxxx and discards them so that the next byte can be encoded:
936 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
937 out
[0] = 0xC0 | code
;
940 else if ( code
< 0xFFFF )
948 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
949 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
950 out
[0] = 0xE0 | code
;
953 else if ( code
<= 0x10FFFF )
961 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
962 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
963 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
964 out
[0] = 0xF0 | code
;
969 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
982 // we only get here if an error occurs during decoding
983 return wxCONV_FAILED
;
986 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
988 if ( m_options
== MAP_INVALID_UTF8_NOT
)
989 return wxMBConvStrictUTF8::MB2WC(buf
, psz
, n
);
993 while (*psz
&& ((!buf
) || (len
< n
)))
995 const char *opsz
= psz
;
996 bool invalid
= false;
997 unsigned char cc
= *psz
++, fc
= cc
;
999 for (cnt
= 0; fc
& 0x80; cnt
++)
1009 // escape the escape character for octal escapes
1010 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1011 && cc
== '\\' && (!buf
|| len
< n
))
1023 // invalid UTF-8 sequence
1028 unsigned ocnt
= cnt
- 1;
1029 wxUint32 res
= cc
& (0x3f >> cnt
);
1033 if ((cc
& 0xC0) != 0x80)
1035 // invalid UTF-8 sequence
1041 res
= (res
<< 6) | (cc
& 0x3f);
1044 if (invalid
|| res
<= utf8_max
[ocnt
])
1046 // illegal UTF-8 encoding
1049 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1050 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1052 // if one of our PUA characters turns up externally
1053 // it must also be treated as an illegal sequence
1054 // (a bit like you have to escape an escape character)
1060 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1061 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1062 if (pa
== wxCONV_FAILED
)
1074 *buf
++ = (wchar_t)res
;
1076 #endif // WC_UTF16/!WC_UTF16
1082 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1084 while (opsz
< psz
&& (!buf
|| len
< n
))
1087 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1088 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1089 wxASSERT(pa
!= wxCONV_FAILED
);
1096 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1102 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1104 while (opsz
< psz
&& (!buf
|| len
< n
))
1106 if ( buf
&& len
+ 3 < n
)
1108 unsigned char on
= *opsz
;
1110 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1111 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1112 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1119 else // MAP_INVALID_UTF8_NOT
1121 return wxCONV_FAILED
;
1127 if (buf
&& (len
< n
))
1133 static inline bool isoctal(wchar_t wch
)
1135 return L
'0' <= wch
&& wch
<= L
'7';
1138 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1140 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1141 return wxMBConvStrictUTF8::WC2MB(buf
, psz
, n
);
1145 while (*psz
&& ((!buf
) || (len
< n
)))
1150 // cast is ok for WC_UTF16
1151 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1152 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1154 cc
= (*psz
++) & 0x7fffffff;
1157 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1158 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1161 *buf
++ = (char)(cc
- wxUnicodePUA
);
1164 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1165 && cc
== L
'\\' && psz
[0] == L
'\\' )
1172 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1174 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1178 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1179 (psz
[1] - L
'0') * 010 +
1189 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1205 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1207 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1213 if (buf
&& (len
< n
))
1219 // ============================================================================
1221 // ============================================================================
1223 #ifdef WORDS_BIGENDIAN
1224 #define wxMBConvUTF16straight wxMBConvUTF16BE
1225 #define wxMBConvUTF16swap wxMBConvUTF16LE
1227 #define wxMBConvUTF16swap wxMBConvUTF16BE
1228 #define wxMBConvUTF16straight wxMBConvUTF16LE
1232 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1234 if ( srcLen
== wxNO_LEN
)
1236 // count the number of bytes in input, including the trailing NULs
1237 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1238 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1241 srcLen
*= BYTES_PER_CHAR
;
1243 else // we already have the length
1245 // we can only convert an entire number of UTF-16 characters
1246 if ( srcLen
% BYTES_PER_CHAR
)
1247 return wxCONV_FAILED
;
1253 // case when in-memory representation is UTF-16 too
1256 // ----------------------------------------------------------------------------
1257 // conversions without endianness change
1258 // ----------------------------------------------------------------------------
1261 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1262 const char *src
, size_t srcLen
) const
1264 // set up the scene for using memcpy() (which is presumably more efficient
1265 // than copying the bytes one by one)
1266 srcLen
= GetLength(src
, srcLen
);
1267 if ( srcLen
== wxNO_LEN
)
1268 return wxCONV_FAILED
;
1270 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1273 if ( dstLen
< inLen
)
1274 return wxCONV_FAILED
;
1276 memcpy(dst
, src
, srcLen
);
1283 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1284 const wchar_t *src
, size_t srcLen
) const
1286 if ( srcLen
== wxNO_LEN
)
1287 srcLen
= wxWcslen(src
) + 1;
1289 srcLen
*= BYTES_PER_CHAR
;
1293 if ( dstLen
< srcLen
)
1294 return wxCONV_FAILED
;
1296 memcpy(dst
, src
, srcLen
);
1302 // ----------------------------------------------------------------------------
1303 // endian-reversing conversions
1304 // ----------------------------------------------------------------------------
1307 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1308 const char *src
, size_t srcLen
) const
1310 srcLen
= GetLength(src
, srcLen
);
1311 if ( srcLen
== wxNO_LEN
)
1312 return wxCONV_FAILED
;
1314 srcLen
/= BYTES_PER_CHAR
;
1318 if ( dstLen
< srcLen
)
1319 return wxCONV_FAILED
;
1321 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1322 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1324 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1332 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1333 const wchar_t *src
, size_t srcLen
) const
1335 if ( srcLen
== wxNO_LEN
)
1336 srcLen
= wxWcslen(src
) + 1;
1338 srcLen
*= BYTES_PER_CHAR
;
1342 if ( dstLen
< srcLen
)
1343 return wxCONV_FAILED
;
1345 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1346 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1348 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1355 #else // !WC_UTF16: wchar_t is UTF-32
1357 // ----------------------------------------------------------------------------
1358 // conversions without endianness change
1359 // ----------------------------------------------------------------------------
1362 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1363 const char *src
, size_t srcLen
) const
1365 srcLen
= GetLength(src
, srcLen
);
1366 if ( srcLen
== wxNO_LEN
)
1367 return wxCONV_FAILED
;
1369 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1372 // optimization: return maximal space which could be needed for this
1373 // string even if the real size could be smaller if the buffer contains
1379 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1380 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1382 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1384 return wxCONV_FAILED
;
1386 if ( ++outLen
> dstLen
)
1387 return wxCONV_FAILED
;
1397 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1398 const wchar_t *src
, size_t srcLen
) const
1400 if ( srcLen
== wxNO_LEN
)
1401 srcLen
= wxWcslen(src
) + 1;
1404 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1405 for ( size_t n
= 0; n
< srcLen
; n
++ )
1408 const size_t numChars
= encode_utf16(*src
++, cc
);
1409 if ( numChars
== wxCONV_FAILED
)
1410 return wxCONV_FAILED
;
1412 outLen
+= numChars
* BYTES_PER_CHAR
;
1415 if ( outLen
> dstLen
)
1416 return wxCONV_FAILED
;
1419 if ( numChars
== 2 )
1421 // second character of a surrogate
1430 // ----------------------------------------------------------------------------
1431 // endian-reversing conversions
1432 // ----------------------------------------------------------------------------
1435 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1436 const char *src
, size_t srcLen
) const
1438 srcLen
= GetLength(src
, srcLen
);
1439 if ( srcLen
== wxNO_LEN
)
1440 return wxCONV_FAILED
;
1442 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1445 // optimization: return maximal space which could be needed for this
1446 // string even if the real size could be smaller if the buffer contains
1452 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1453 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1458 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1460 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1462 const size_t numChars
= decode_utf16(tmp
, ch
);
1463 if ( numChars
== wxCONV_FAILED
)
1464 return wxCONV_FAILED
;
1466 if ( numChars
== 2 )
1469 if ( ++outLen
> dstLen
)
1470 return wxCONV_FAILED
;
1480 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1481 const wchar_t *src
, size_t srcLen
) const
1483 if ( srcLen
== wxNO_LEN
)
1484 srcLen
= wxWcslen(src
) + 1;
1487 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1488 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1491 const size_t numChars
= encode_utf16(*src
, cc
);
1492 if ( numChars
== wxCONV_FAILED
)
1493 return wxCONV_FAILED
;
1495 outLen
+= numChars
* BYTES_PER_CHAR
;
1498 if ( outLen
> dstLen
)
1499 return wxCONV_FAILED
;
1501 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1502 if ( numChars
== 2 )
1504 // second character of a surrogate
1505 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1513 #endif // WC_UTF16/!WC_UTF16
1516 // ============================================================================
1518 // ============================================================================
1520 #ifdef WORDS_BIGENDIAN
1521 #define wxMBConvUTF32straight wxMBConvUTF32BE
1522 #define wxMBConvUTF32swap wxMBConvUTF32LE
1524 #define wxMBConvUTF32swap wxMBConvUTF32BE
1525 #define wxMBConvUTF32straight wxMBConvUTF32LE
1529 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1530 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1533 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1535 if ( srcLen
== wxNO_LEN
)
1537 // count the number of bytes in input, including the trailing NULs
1538 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1539 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1542 srcLen
*= BYTES_PER_CHAR
;
1544 else // we already have the length
1546 // we can only convert an entire number of UTF-32 characters
1547 if ( srcLen
% BYTES_PER_CHAR
)
1548 return wxCONV_FAILED
;
1554 // case when in-memory representation is UTF-16
1557 // ----------------------------------------------------------------------------
1558 // conversions without endianness change
1559 // ----------------------------------------------------------------------------
1562 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1563 const char *src
, size_t srcLen
) const
1565 srcLen
= GetLength(src
, srcLen
);
1566 if ( srcLen
== wxNO_LEN
)
1567 return wxCONV_FAILED
;
1569 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1570 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1572 for ( size_t n
= 0; n
< inLen
; n
++ )
1575 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1576 if ( numChars
== wxCONV_FAILED
)
1577 return wxCONV_FAILED
;
1582 if ( outLen
> dstLen
)
1583 return wxCONV_FAILED
;
1586 if ( numChars
== 2 )
1588 // second character of a surrogate
1598 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1599 const wchar_t *src
, size_t srcLen
) const
1601 if ( srcLen
== wxNO_LEN
)
1602 srcLen
= wxWcslen(src
) + 1;
1606 // optimization: return maximal space which could be needed for this
1607 // string instead of the exact amount which could be less if there are
1608 // any surrogates in the input
1610 // we consider that surrogates are rare enough to make it worthwhile to
1611 // avoid running the loop below at the cost of slightly extra memory
1613 return srcLen
* BYTES_PER_CHAR
;
1616 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1618 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1620 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1622 return wxCONV_FAILED
;
1624 outLen
+= BYTES_PER_CHAR
;
1626 if ( outLen
> dstLen
)
1627 return wxCONV_FAILED
;
1635 // ----------------------------------------------------------------------------
1636 // endian-reversing conversions
1637 // ----------------------------------------------------------------------------
1640 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1641 const char *src
, size_t srcLen
) const
1643 srcLen
= GetLength(src
, srcLen
);
1644 if ( srcLen
== wxNO_LEN
)
1645 return wxCONV_FAILED
;
1647 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1648 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1650 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1653 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1654 if ( numChars
== wxCONV_FAILED
)
1655 return wxCONV_FAILED
;
1660 if ( outLen
> dstLen
)
1661 return wxCONV_FAILED
;
1664 if ( numChars
== 2 )
1666 // second character of a surrogate
1676 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1677 const wchar_t *src
, size_t srcLen
) const
1679 if ( srcLen
== wxNO_LEN
)
1680 srcLen
= wxWcslen(src
) + 1;
1684 // optimization: return maximal space which could be needed for this
1685 // string instead of the exact amount which could be less if there are
1686 // any surrogates in the input
1688 // we consider that surrogates are rare enough to make it worthwhile to
1689 // avoid running the loop below at the cost of slightly extra memory
1691 return srcLen
*BYTES_PER_CHAR
;
1694 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1696 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1698 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1700 return wxCONV_FAILED
;
1702 outLen
+= BYTES_PER_CHAR
;
1704 if ( outLen
> dstLen
)
1705 return wxCONV_FAILED
;
1707 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1713 #else // !WC_UTF16: wchar_t is UTF-32
1715 // ----------------------------------------------------------------------------
1716 // conversions without endianness change
1717 // ----------------------------------------------------------------------------
1720 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1721 const char *src
, size_t srcLen
) const
1723 // use memcpy() as it should be much faster than hand-written loop
1724 srcLen
= GetLength(src
, srcLen
);
1725 if ( srcLen
== wxNO_LEN
)
1726 return wxCONV_FAILED
;
1728 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1731 if ( dstLen
< inLen
)
1732 return wxCONV_FAILED
;
1734 memcpy(dst
, src
, srcLen
);
1741 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1742 const wchar_t *src
, size_t srcLen
) const
1744 if ( srcLen
== wxNO_LEN
)
1745 srcLen
= wxWcslen(src
) + 1;
1747 srcLen
*= BYTES_PER_CHAR
;
1751 if ( dstLen
< srcLen
)
1752 return wxCONV_FAILED
;
1754 memcpy(dst
, src
, srcLen
);
1760 // ----------------------------------------------------------------------------
1761 // endian-reversing conversions
1762 // ----------------------------------------------------------------------------
1765 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1766 const char *src
, size_t srcLen
) const
1768 srcLen
= GetLength(src
, srcLen
);
1769 if ( srcLen
== wxNO_LEN
)
1770 return wxCONV_FAILED
;
1772 srcLen
/= BYTES_PER_CHAR
;
1776 if ( dstLen
< srcLen
)
1777 return wxCONV_FAILED
;
1779 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1780 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1782 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1790 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1791 const wchar_t *src
, size_t srcLen
) const
1793 if ( srcLen
== wxNO_LEN
)
1794 srcLen
= wxWcslen(src
) + 1;
1796 srcLen
*= BYTES_PER_CHAR
;
1800 if ( dstLen
< srcLen
)
1801 return wxCONV_FAILED
;
1803 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1804 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1806 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1813 #endif // WC_UTF16/!WC_UTF16
1816 // ============================================================================
1817 // The classes doing conversion using the iconv_xxx() functions
1818 // ============================================================================
1822 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1823 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1824 // (unless there's yet another bug in glibc) the only case when iconv()
1825 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1826 // left in the input buffer -- when _real_ error occurs,
1827 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1829 // [This bug does not appear in glibc 2.2.]
1830 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1831 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1832 (errno != E2BIG || bufLeft != 0))
1834 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1837 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1839 #define ICONV_T_INVALID ((iconv_t)-1)
1841 #if SIZEOF_WCHAR_T == 4
1842 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1843 #define WC_ENC wxFONTENCODING_UTF32
1844 #elif SIZEOF_WCHAR_T == 2
1845 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1846 #define WC_ENC wxFONTENCODING_UTF16
1847 #else // sizeof(wchar_t) != 2 nor 4
1848 // does this ever happen?
1849 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1852 // ----------------------------------------------------------------------------
1853 // wxMBConv_iconv: encapsulates an iconv character set
1854 // ----------------------------------------------------------------------------
1856 class wxMBConv_iconv
: public wxMBConv
1859 wxMBConv_iconv(const char *name
);
1860 virtual ~wxMBConv_iconv();
1862 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1863 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1865 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1866 virtual size_t GetMBNulLen() const;
1868 #if wxUSE_UNICODE_UTF8
1869 virtual bool IsUTF8() const;
1872 virtual wxMBConv
*Clone() const
1874 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
1875 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1880 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1883 // the iconv handlers used to translate from multibyte
1884 // to wide char and in the other direction
1889 // guards access to m2w and w2m objects
1890 wxMutex m_iconvMutex
;
// the name (for iconv_open()) of a wide char charset -- if none is
// available on this machine, it will remain empty
1896 static wxString ms_wcCharsetName
;
1898 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1899 // different endian-ness than the native one
1900 static bool ms_wcNeedsSwap
;
1903 // name of the encoding handled by this conversion
1906 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1908 size_t m_minMBCharWidth
;
1911 // make the constructor available for unit testing
1912 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
1914 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1915 if ( !result
->IsOk() )
1924 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1925 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1927 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
1930 m_minMBCharWidth
= 0;
1932 // check for charset that represents wchar_t:
1933 if ( ms_wcCharsetName
.empty() )
1935 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1938 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1939 #else // !wxUSE_FONTMAP
1940 static const wxChar
*names_static
[] =
1942 #if SIZEOF_WCHAR_T == 4
1944 #elif SIZEOF_WCHAR_T = 2
1949 const wxChar
**names
= names_static
;
1950 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1952 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1954 const wxString
nameCS(*names
);
1956 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1957 wxString
nameXE(nameCS
);
1959 #ifdef WORDS_BIGENDIAN
1961 #else // little endian
1965 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1968 m2w
= iconv_open(nameXE
.ToAscii(), name
);
1969 if ( m2w
== ICONV_T_INVALID
)
1971 // try charset w/o bytesex info (e.g. "UCS4")
1972 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1974 m2w
= iconv_open(nameCS
.ToAscii(), name
);
1976 // and check for bytesex ourselves:
1977 if ( m2w
!= ICONV_T_INVALID
)
1979 char buf
[2], *bufPtr
;
1980 wchar_t wbuf
[2], *wbufPtr
;
1988 outsz
= SIZEOF_WCHAR_T
* 2;
1993 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1994 (char**)&wbufPtr
, &outsz
);
1996 if (ICONV_FAILED(res
, insz
))
1998 wxLogLastError(wxT("iconv"));
1999 wxLogError(_("Conversion to charset '%s' doesn't work."),
2002 else // ok, can convert to this encoding, remember it
2004 ms_wcCharsetName
= nameCS
;
2005 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2009 else // use charset not requiring byte swapping
2011 ms_wcCharsetName
= nameXE
;
2015 wxLogTrace(TRACE_STRCONV
,
2016 wxT("iconv wchar_t charset is \"%s\"%s"),
2017 ms_wcCharsetName
.empty() ? wxString("<none>")
2019 ms_wcNeedsSwap
? _T(" (needs swap)")
2022 else // we already have ms_wcCharsetName
2024 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2027 if ( ms_wcCharsetName
.empty() )
2029 w2m
= ICONV_T_INVALID
;
2033 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2034 if ( w2m
== ICONV_T_INVALID
)
2036 wxLogTrace(TRACE_STRCONV
,
2037 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2038 ms_wcCharsetName
.c_str(), name
);
2043 wxMBConv_iconv::~wxMBConv_iconv()
2045 if ( m2w
!= ICONV_T_INVALID
)
2047 if ( w2m
!= ICONV_T_INVALID
)
2051 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
// find the string length: notice that this must be done differently for
// NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2056 const size_t nulLen
= GetMBNulLen();
2060 return wxCONV_FAILED
;
2063 inbuf
= strlen(psz
); // arguably more optimized than our version
// for UTF-16/32 not only do we need to have 2/4 consecutive NULs but
2069 // they also have to start at character boundary and not span two
2070 // adjacent characters
2072 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2079 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2080 // Unfortunately there are a couple of global wxCSConv objects such as
2081 // wxConvLocal that are used all over wx code, so we have to make sure
// the handle is used by at most one thread at a time. Otherwise
2083 // only a few wx classes would be safe to use from non-main threads
2084 // as MB<->WC conversion would fail "randomly".
2085 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2086 #endif // wxUSE_THREADS
2088 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
2090 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2091 wchar_t *bufPtr
= buf
;
2092 const char *pszPtr
= psz
;
2096 // have destination buffer, convert there
2098 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
2099 (char**)&bufPtr
, &outbuf
);
2100 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
2104 // convert to native endianness
2105 for ( unsigned i
= 0; i
< res
; i
++ )
2106 buf
[n
] = WC_BSWAP(buf
[i
]);
2109 // NUL-terminate the string if there is any space left
2115 // no destination buffer... convert using temp buffer
2116 // to calculate destination buffer requirement
2123 outbuf
= 8 * SIZEOF_WCHAR_T
;
2126 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
2127 (char**)&bufPtr
, &outbuf
);
2129 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
2131 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2134 if (ICONV_FAILED(cres
, inbuf
))
2136 //VS: it is ok if iconv fails, hence trace only
2137 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2138 return wxCONV_FAILED
;
2144 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2147 // NB: explained in MB2WC
2148 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2151 size_t inlen
= wxWcslen(psz
);
2152 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
2156 wchar_t *tmpbuf
= 0;
2160 // need to copy to temp buffer to switch endianness
2161 // (doing WC_BSWAP twice on the original buffer won't help, as it
2162 // could be in read-only memory, or be accessed in some other thread)
2163 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
2164 for ( size_t i
= 0; i
< inlen
; i
++ )
2165 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
2167 tmpbuf
[inlen
] = L
'\0';
2173 // have destination buffer, convert there
2174 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
2178 // NB: iconv was given only wcslen(psz) characters on input, and so
2179 // it couldn't convert the trailing zero. Let's do it ourselves
2180 // if there's some room left for it in the output buffer.
2186 // no destination buffer: convert using temp buffer
2187 // to calculate destination buffer requirement
2195 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
2199 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2207 if (ICONV_FAILED(cres
, inbuf
))
2209 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2210 return wxCONV_FAILED
;
2216 size_t wxMBConv_iconv::GetMBNulLen() const
2218 if ( m_minMBCharWidth
== 0 )
2220 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2223 // NB: explained in MB2WC
2224 wxMutexLocker
lock(self
->m_iconvMutex
);
2227 const wchar_t *wnul
= L
"";
2228 char buf
[8]; // should be enough for NUL in any encoding
2229 size_t inLen
= sizeof(wchar_t),
2230 outLen
= WXSIZEOF(buf
);
2231 char *inBuff
= (char *)wnul
;
2232 char *outBuff
= buf
;
2233 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2235 self
->m_minMBCharWidth
= (size_t)-1;
2239 self
->m_minMBCharWidth
= outBuff
- buf
;
2243 return m_minMBCharWidth
;
2246 #if wxUSE_UNICODE_UTF8
2247 bool wxMBConv_iconv::IsUTF8() const
2249 return wxStricmp(m_name
, "UTF-8") == 0 ||
2250 wxStricmp(m_name
, "UTF8") == 0;
2254 #endif // HAVE_ICONV
2257 // ============================================================================
2258 // Win32 conversion classes
2259 // ============================================================================
2261 #ifdef wxHAVE_WIN32_MB2WC
2265 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2266 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2269 class wxMBConv_win32
: public wxMBConv
2274 m_CodePage
= CP_ACP
;
2275 m_minMBCharWidth
= 0;
2278 wxMBConv_win32(const wxMBConv_win32
& conv
)
2281 m_CodePage
= conv
.m_CodePage
;
2282 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2286 wxMBConv_win32(const char* name
)
2288 m_CodePage
= wxCharsetToCodepage(name
);
2289 m_minMBCharWidth
= 0;
2292 wxMBConv_win32(wxFontEncoding encoding
)
2294 m_CodePage
= wxEncodingToCodepage(encoding
);
2295 m_minMBCharWidth
= 0;
2297 #endif // wxUSE_FONTMAP
2299 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
// note that we have to use MB_ERR_INVALID_CHARS flag as without it
// the behaviour is not compatible with the Unix version (using iconv)
// and breaks the library itself, e.g. wxTextInputStream::NextChar()
2304 // wouldn't work if reading an incomplete MB char didn't result in an
2307 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2308 // Win XP or newer and it is not supported for UTF-[78] so we always
2309 // use our own conversions in this case. See
2310 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2311 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2312 if ( m_CodePage
== CP_UTF8
)
2314 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2317 if ( m_CodePage
== CP_UTF7
)
2319 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2323 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2324 IsAtLeastWin2kSP4() )
2326 flags
= MB_ERR_INVALID_CHARS
;
2329 const size_t len
= ::MultiByteToWideChar
2331 m_CodePage
, // code page
2332 flags
, // flags: fail on error
2333 psz
, // input string
2334 -1, // its length (NUL-terminated)
2335 buf
, // output string
2336 buf
? n
: 0 // size of output buffer
2340 // function totally failed
2341 return wxCONV_FAILED
;
2344 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2345 // check if we succeeded, by doing a double trip:
2346 if ( !flags
&& buf
)
2348 const size_t mbLen
= strlen(psz
);
2349 wxCharBuffer
mbBuf(mbLen
);
2350 if ( ::WideCharToMultiByte
2357 mbLen
+ 1, // size in bytes, not length
2361 strcmp(mbBuf
, psz
) != 0 )
2363 // we didn't obtain the same thing we started from, hence
2364 // the conversion was lossy and we consider that it failed
2365 return wxCONV_FAILED
;
2369 // note that it returns count of written chars for buf != NULL and size
2370 // of the needed buffer for buf == NULL so in either case the length of
2371 // the string (which never includes the terminating NUL) is one less
2375 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2378 we have a problem here: by default, WideCharToMultiByte() may
2379 replace characters unrepresentable in the target code page with bad
2380 quality approximations such as turning "1/2" symbol (U+00BD) into
2381 "1" for the code pages which don't have it and we, obviously, want
2382 to avoid this at any price
2384 the trouble is that this function does it _silently_, i.e. it won't
2385 even tell us whether it did or not... Win98/2000 and higher provide
2386 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2387 we have to resort to a round trip, i.e. check that converting back
2388 results in the same string -- this is, of course, expensive but
2389 otherwise we simply can't be sure to not garble the data.
2392 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2393 // it doesn't work with CJK encodings (which we test for rather roughly
2394 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2396 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2399 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2401 // it's our lucky day
2402 flags
= WC_NO_BEST_FIT_CHARS
;
2403 pUsedDef
= &usedDef
;
2405 else // old system or unsupported encoding
2411 const size_t len
= ::WideCharToMultiByte
2413 m_CodePage
, // code page
2414 flags
, // either none or no best fit
2415 pwz
, // input string
2416 -1, // it is (wide) NUL-terminated
2417 buf
, // output buffer
2418 buf
? n
: 0, // and its size
2419 NULL
, // default "replacement" char
2420 pUsedDef
// [out] was it used?
2425 // function totally failed
2426 return wxCONV_FAILED
;
2429 // we did something, check if we really succeeded
2432 // check if the conversion failed, i.e. if any replacements
2435 return wxCONV_FAILED
;
2437 else // we must resort to double tripping...
2439 // first we need to ensure that we really have the MB data: this is
2440 // not the case if we're called with NULL buffer, in which case we
2441 // need to do the conversion yet again
2442 wxCharBuffer bufDef
;
2445 bufDef
= wxCharBuffer(len
);
2446 buf
= bufDef
.data();
2447 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2448 buf
, len
, NULL
, NULL
) )
2449 return wxCONV_FAILED
;
2452 wxWCharBuffer
wcBuf(n
);
2453 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2454 wcscmp(wcBuf
, pwz
) != 0 )
2456 // we didn't obtain the same thing we started from, hence
2457 // the conversion was lossy and we consider that it failed
2458 return wxCONV_FAILED
;
2462 // see the comment above for the reason of "len - 1"
2466 virtual size_t GetMBNulLen() const
2468 if ( m_minMBCharWidth
== 0 )
2470 int len
= ::WideCharToMultiByte
2472 m_CodePage
, // code page
2474 L
"", // input string
2475 1, // translate just the NUL
2476 NULL
, // output buffer
2478 NULL
, // no replacement char
2479 NULL
// [out] don't care if it was used
2482 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2486 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2487 self
->m_minMBCharWidth
= (size_t)-1;
2491 self
->m_minMBCharWidth
= (size_t)-1;
2497 self
->m_minMBCharWidth
= len
;
2502 return m_minMBCharWidth
;
2505 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2507 bool IsOk() const { return m_CodePage
!= -1; }
2510 static bool CanUseNoBestFit()
2512 static int s_isWin98Or2k
= -1;
2514 if ( s_isWin98Or2k
== -1 )
2517 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2519 case wxOS_WINDOWS_9X
:
2520 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2523 case wxOS_WINDOWS_NT
:
2524 s_isWin98Or2k
= verMaj
>= 5;
2528 // unknown: be conservative by default
2533 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2536 return s_isWin98Or2k
== 1;
2539 static bool IsAtLeastWin2kSP4()
2544 static int s_isAtLeastWin2kSP4
= -1;
2546 if ( s_isAtLeastWin2kSP4
== -1 )
2548 OSVERSIONINFOEX ver
;
2550 memset(&ver
, 0, sizeof(ver
));
2551 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2552 GetVersionEx((OSVERSIONINFO
*)&ver
);
2554 s_isAtLeastWin2kSP4
=
2555 ((ver
.dwMajorVersion
> 5) || // Vista+
2556 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2557 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2558 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2562 return s_isAtLeastWin2kSP4
== 1;
2567 // the code page we're working with
2570 // cached result of GetMBNulLen(), set to 0 initially meaning
2572 size_t m_minMBCharWidth
;
2575 #endif // wxHAVE_WIN32_MB2WC
2578 // ============================================================================
2579 // wxEncodingConverter based conversion classes
2580 // ============================================================================
2584 class wxMBConv_wxwin
: public wxMBConv
2589 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2590 // The wxMBConv_cf class does a better job.
2591 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2592 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2593 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2597 // temporarily just use wxEncodingConverter stuff,
2598 // so that it works while a better implementation is built
2599 wxMBConv_wxwin(const char* name
)
2602 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2604 m_enc
= wxFONTENCODING_SYSTEM
;
2609 wxMBConv_wxwin(wxFontEncoding enc
)
2616 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2618 size_t inbuf
= strlen(psz
);
2621 if (!m2w
.Convert(psz
, buf
))
2622 return wxCONV_FAILED
;
2627 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2629 const size_t inbuf
= wxWcslen(psz
);
2632 if (!w2m
.Convert(psz
, buf
))
2633 return wxCONV_FAILED
;
2639 virtual size_t GetMBNulLen() const
2643 case wxFONTENCODING_UTF16BE
:
2644 case wxFONTENCODING_UTF16LE
:
2647 case wxFONTENCODING_UTF32BE
:
2648 case wxFONTENCODING_UTF32LE
:
2656 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2658 bool IsOk() const { return m_ok
; }
2661 wxFontEncoding m_enc
;
2662 wxEncodingConverter m2w
, w2m
;
2665 // were we initialized successfully?
2668 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2671 // make the constructors available for unit testing
2672 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2674 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2675 if ( !result
->IsOk() )
2684 #endif // wxUSE_FONTMAP
2686 // ============================================================================
2687 // wxCSConv implementation
2688 // ============================================================================
2690 void wxCSConv::Init()
2697 wxCSConv::wxCSConv(const wxString
& charset
)
2701 if ( !charset
.empty() )
2703 SetName(charset
.ToAscii());
2707 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2709 m_encoding
= wxFONTENCODING_SYSTEM
;
2713 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2715 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2717 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2719 encoding
= wxFONTENCODING_SYSTEM
;
2724 m_encoding
= encoding
;
2727 wxCSConv::~wxCSConv()
2732 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2737 SetName(conv
.m_name
);
2738 m_encoding
= conv
.m_encoding
;
2741 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2745 SetName(conv
.m_name
);
2746 m_encoding
= conv
.m_encoding
;
2751 void wxCSConv::Clear()
2760 void wxCSConv::SetName(const char *charset
)
2764 m_name
= wxStrdup(charset
);
2771 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2772 wxEncodingNameCache
);
2774 static wxEncodingNameCache gs_nameCache
;
2777 wxMBConv
*wxCSConv::DoCreate() const
2780 wxLogTrace(TRACE_STRCONV
,
2781 wxT("creating conversion for %s"),
2783 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2784 #endif // wxUSE_FONTMAP
2786 // check for the special case of ASCII or ISO8859-1 charset: as we have
2787 // special knowledge of it anyhow, we don't need to create a special
2788 // conversion object
2789 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2790 m_encoding
== wxFONTENCODING_DEFAULT
)
2792 // don't convert at all
2796 // we trust OS to do conversion better than we can so try external
2797 // conversion methods first
2799 // the full order is:
2800 // 1. OS conversion (iconv() under Unix or Win32 API)
2801 // 2. hard coded conversions for UTF
2802 // 3. wxEncodingConverter as fall back
2808 #endif // !wxUSE_FONTMAP
2811 wxFontEncoding
encoding(m_encoding
);
2816 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2824 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2825 #endif // wxUSE_FONTMAP
2829 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2830 if ( it
!= gs_nameCache
.end() )
2832 if ( it
->second
.empty() )
2835 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
2842 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2843 // CS : in case this does not return valid names (eg for MacRoman)
2844 // encoding got a 'failure' entry in the cache all the same,
2845 // although it just has to be created using a different method, so
2846 // only store failed iconv creation attempts (or perhaps we
// shouldn't do this at all?)
2848 if ( names
[0] != NULL
)
2850 for ( ; *names
; ++names
)
2852 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2853 // will need changes that will obsolete this
2854 wxString
name(*names
);
2855 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
2858 gs_nameCache
[encoding
] = *names
;
2865 gs_nameCache
[encoding
] = _T(""); // cache the failure
2868 #endif // wxUSE_FONTMAP
2870 #endif // HAVE_ICONV
2872 #ifdef wxHAVE_WIN32_MB2WC
2875 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2876 : new wxMBConv_win32(m_encoding
);
2885 #endif // wxHAVE_WIN32_MB2WC
2889 // leave UTF16 and UTF32 to the built-ins of wx
2890 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2891 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2894 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
2895 : new wxMBConv_cf(m_encoding
);
2897 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
2906 #endif // __DARWIN__
2909 wxFontEncoding enc
= m_encoding
;
2911 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2913 // use "false" to suppress interactive dialogs -- we can be called from
2914 // anywhere and popping up a dialog from here is the last thing we want to
2916 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2918 #endif // wxUSE_FONTMAP
2922 case wxFONTENCODING_UTF7
:
2923 return new wxMBConvUTF7
;
2925 case wxFONTENCODING_UTF8
:
2926 return new wxMBConvUTF8
;
2928 case wxFONTENCODING_UTF16BE
:
2929 return new wxMBConvUTF16BE
;
2931 case wxFONTENCODING_UTF16LE
:
2932 return new wxMBConvUTF16LE
;
2934 case wxFONTENCODING_UTF32BE
:
2935 return new wxMBConvUTF32BE
;
2937 case wxFONTENCODING_UTF32LE
:
2938 return new wxMBConvUTF32LE
;
2941 // nothing to do but put here to suppress gcc warnings
2948 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2949 : new wxMBConv_wxwin(m_encoding
);
2955 #endif // wxUSE_FONTMAP
2957 // NB: This is a hack to prevent deadlock. What could otherwise happen
2958 // in Unicode build: wxConvLocal creation ends up being here
2959 // because of some failure and logs the error. But wxLog will try to
2960 // attach a timestamp, for which it will need wxConvLocal (to convert
2961 // time to char* and then wchar_t*), but that fails, tries to log the
2962 // error, but wxLog has an (already locked) critical section that
2963 // guards the static buffer.
2964 static bool alreadyLoggingError
= false;
2965 if (!alreadyLoggingError
)
2967 alreadyLoggingError
= true;
2968 wxLogError(_("Cannot convert from the charset '%s'!"),
2972 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
2973 #else // !wxUSE_FONTMAP
2974 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
2975 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2978 alreadyLoggingError
= false;
2984 void wxCSConv::CreateConvIfNeeded() const
2988 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
// if we have neither the name nor the encoding, use the default
// encoding for this system
2992 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
2995 self
->m_encoding
= wxLocale::GetSystemEncoding();
2997 // fallback to some reasonable default:
2998 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
2999 #endif // wxUSE_INTL
3002 self
->m_convReal
= DoCreate();
3003 self
->m_deferred
= false;
3007 bool wxCSConv::IsOk() const
3009 CreateConvIfNeeded();
3011 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3012 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3013 return true; // always ok as we do it ourselves
3015 // m_convReal->IsOk() is called at its own creation, so we know it must
3016 // be ok if m_convReal is non-NULL
3017 return m_convReal
!= NULL
;
3020 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3021 const char *src
, size_t srcLen
) const
3023 CreateConvIfNeeded();
3026 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3029 return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
);
3032 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3033 const wchar_t *src
, size_t srcLen
) const
3035 CreateConvIfNeeded();
3038 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3041 return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
);
3044 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3046 CreateConvIfNeeded();
3049 return m_convReal
->MB2WC(buf
, psz
, n
);
3052 size_t len
= strlen(psz
);
3056 for (size_t c
= 0; c
<= len
; c
++)
3057 buf
[c
] = (unsigned char)(psz
[c
]);
3063 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3065 CreateConvIfNeeded();
3068 return m_convReal
->WC2MB(buf
, psz
, n
);
3071 const size_t len
= wxWcslen(psz
);
3074 for (size_t c
= 0; c
<= len
; c
++)
3077 return wxCONV_FAILED
;
3079 buf
[c
] = (char)psz
[c
];
3084 for (size_t c
= 0; c
<= len
; c
++)
3087 return wxCONV_FAILED
;
3094 size_t wxCSConv::GetMBNulLen() const
3096 CreateConvIfNeeded();
3100 return m_convReal
->GetMBNulLen();
3103 // otherwise, we are ISO-8859-1
3107 #if wxUSE_UNICODE_UTF8
3108 bool wxCSConv::IsUTF8() const
3110 CreateConvIfNeeded();
3114 return m_convReal
->IsUTF8();
3117 // otherwise, we are ISO-8859-1
3125 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3128 return wxWCharBuffer();
3130 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3132 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3134 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// wxSafeConvertWX2MB: converts the wide string ws to a wxCharBuffer, first
// with wxConvLibc and then with a wxMBConvUTF8 constructed with
// MAP_INVALID_UTF8_TO_OCTAL.  One path returns an empty wxCharBuffer early.
// NOTE(review): the condition for the early empty return and the test that
// decides whether the second converter is used are not visible in this
// extract -- presumably a NULL input check and "first conversion failed";
// confirm.
3139 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3142 return wxCharBuffer();
// First attempt: wxConvLibc.
3144 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
// Second attempt: wxMBConvUTF8 in MAP_INVALID_UTF8_TO_OCTAL mode.
3146 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3151 #endif // wxUSE_UNICODE
3153 // ----------------------------------------------------------------------------
3155 // ----------------------------------------------------------------------------
3157 // NB: The reason why we create converter objects in this convoluted way,
3158 // using a factory function instead of global variable, is that they
3159 // may be used at static initialization time (some of them are used by
3160 // wxString ctors and there may be a global wxString object). In other
3161 // words, possibly _before_ the converter global object would be
3168 #undef wxConvISO8859_1
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) expands to three
// things for the converter called `name`:
//  - an exported pointer variable name##Ptr of type klass*, set to NULL;
//  - an exported accessor wxGet_##name##Ptr() returning klass*, whose body
//    holds a function-local static impl_klass object name##Obj constructed
//    with ctor_args and returns its address;
//  - a file-static klass* gs_##name##instance initialised by calling the
//    accessor, so the accessor runs during static initialisation.
// ctor_args is pasted verbatim after the object name, so it must either be
// empty or carry its own parentheses.
3170 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3171 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3172 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3174 static impl_klass name##Obj ctor_args; \
3175 return &name##Obj; \
3177 /* this ensures that all global converter objects are created */ \
3178 /* by the time static initialization is done, i.e. before any */ \
3179 /* thread is launched: */ \
3180 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for WX_DEFINE_GLOBAL_CONV2 when the implementation class is the
// same as the interface class: klass is passed for both parameters.
3182 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3183 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// Definitions of the global converter objects via the macros above.
//
// wxConvLibc is defined twice below with the interface class wxMBConv but
// two different implementation classes (wxMBConv_win32 and wxMBConvLibc).
// NOTE(review): the preprocessor conditional that selects exactly one of the
// two definitions is not visible in this extract; confirm.
3186 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3188 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
// UTF-8 and UTF-7 converters, both constructed without arguments.
3191 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, wxEMPTY_PARAMETER_VALUE
);
3192 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, wxEMPTY_PARAMETER_VALUE
);
// wxCSConv-based converters, constructed from a font encoding constant.
3194 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3195 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// Exported converter pointers, initialised through the accessor functions
// generated by the macros: wxConvCurrent starts out as the wxConvLibc
// object and wxConvUI as the wxConvLocal object.
3197 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3198 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3201 // The xnu kernel always communicates file paths in decomposed UTF-8.
3202 // WARNING: Are we sure that CFString's conversion will cause decomposition?
// File-static wxMBConv_cf converter constructed with wxFONTENCODING_UTF8.
// NOTE(review): the preprocessor line opening the __DARWIN__ branch is not
// visible in this extract; the #else/#endif comments below indicate this
// object belongs to that branch -- confirm.
3203 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
// Exported pointer to the converter used for file names.  In the
// !__DARWIN__ branch it is initialised from wxGet_wxConvLibcPtr().
// NOTE(review): the initialiser used in the __DARWIN__ branch is not visible
// in this extract -- presumably the address of wxConvMacUTF8DObj; confirm.
3206 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3209 #else // !__DARWIN__
3210 wxGet_wxConvLibcPtr();
3211 #endif // __DARWIN__/!__DARWIN__
3213 #else // !wxUSE_WCHAR_T
3215 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3216 // stand-ins in absence of wchar_t
3217 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3222 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T