1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
53 #include "wx/thread.h"
56 #include "wx/encconv.h"
57 #include "wx/fontmap.h"
60 #include "wx/mac/corefoundation/private/strconv_cf.h"
61 #endif //def __DARWIN__
64 #define TRACE_STRCONV _T("strconv")
66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
68 #if SIZEOF_WCHAR_T == 2
73 // ============================================================================
75 // ============================================================================
77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
78 static bool NotAllNULs(const char *p
, size_t n
)
80 while ( n
&& *p
++ == '\0' )
86 // ----------------------------------------------------------------------------
87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
88 // ----------------------------------------------------------------------------
90 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
95 *output
= (wxUint16
) input
;
99 else if (input
>= 0x110000)
101 return wxCONV_FAILED
;
107 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
108 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
115 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
117 if ((*input
< 0xd800) || (*input
> 0xdfff))
122 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
125 return wxCONV_FAILED
;
129 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
135 typedef wchar_t wxDecodeSurrogate_t
;
137 typedef wxUint16 wxDecodeSurrogate_t
;
138 #endif // WC_UTF16/!WC_UTF16
140 // returns the next UTF-32 character from the wchar_t buffer and advances the
141 // pointer to the character after this one
143 // if an invalid character is found, *pSrc is set to NULL, the caller must
145 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
149 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
150 if ( n
== wxCONV_FAILED
)
158 // ----------------------------------------------------------------------------
160 // ----------------------------------------------------------------------------
163 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
164 const char *src
, size_t srcLen
) const
166 // although new conversion classes are supposed to implement this function
167 // directly, the existing ones only implement the old MB2WC() and so, to
168 // avoid having to rewrite all conversion classes at once, we provide a
169 // default (but not efficient) implementation of this one in terms of the
170 // old function by copying the input to ensure that it's NUL-terminated and
171 // then using MB2WC() to convert it
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten
= 0;
176 // the number of NULs terminating this string
177 size_t nulLen
= 0; // not really needed, but just to avoid warnings
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
185 if ( srcLen
!= wxNO_LEN
)
187 // we need to know how to find the end of this string
188 nulLen
= GetMBNulLen();
189 if ( nulLen
== wxCONV_FAILED
)
190 return wxCONV_FAILED
;
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
197 char * const p
= bufTmp
.data();
198 memcpy(p
, src
, srcLen
);
199 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
205 srcEnd
= src
+ srcLen
;
207 else // quit after the first loop iteration
214 // try to convert the current chunk
215 size_t lenChunk
= MB2WC(NULL
, src
, 0);
216 if ( lenChunk
== wxCONV_FAILED
)
217 return wxCONV_FAILED
;
219 lenChunk
++; // for the L'\0' at the end of this chunk
221 dstWritten
+= lenChunk
;
225 // nothing left in the input string, conversion succeeded
231 if ( dstWritten
> dstLen
)
232 return wxCONV_FAILED
;
234 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
235 return wxCONV_FAILED
;
242 // we convert just one chunk in this case as this is the entire
247 // advance the input pointer past the end of this chunk
248 while ( NotAllNULs(src
, nulLen
) )
250 // notice that we must skip over multiple bytes here as we suppose
251 // that if NUL takes 2 or 4 bytes, then all the other characters do
252 // too and so if we advanced by a single byte we might erroneously
253 // detect sequences of NUL bytes in the middle of the input
257 src
+= nulLen
; // skipping over its terminator as well
259 // note that ">=" (and not just "==") is needed here as the terminator
260 // we skipped just above could be inside or just after the buffer
261 // delimited by srcEnd
270 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
271 const wchar_t *src
, size_t srcLen
) const
273 // the number of chars [which would be] written to dst [if it were not NULL]
274 size_t dstWritten
= 0;
276 // make a copy of the input string unless it is already properly
279 // if we don't know its length we have no choice but to assume that it is,
280 // indeed, properly terminated
281 wxWCharBuffer bufTmp
;
282 if ( srcLen
== wxNO_LEN
)
284 srcLen
= wxWcslen(src
) + 1;
286 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
288 // make a copy in order to properly NUL-terminate the string
289 bufTmp
= wxWCharBuffer(srcLen
);
290 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
294 const size_t lenNul
= GetMBNulLen();
295 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
297 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
299 // try to convert the current chunk
300 size_t lenChunk
= WC2MB(NULL
, src
, 0);
302 if ( lenChunk
== wxCONV_FAILED
)
303 return wxCONV_FAILED
;
306 dstWritten
+= lenChunk
;
310 if ( dstWritten
> dstLen
)
311 return wxCONV_FAILED
;
313 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
314 return wxCONV_FAILED
;
323 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
325 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
326 if ( rc
!= wxCONV_FAILED
)
328 // ToWChar() returns the buffer length, i.e. including the trailing
329 // NUL, while this method doesn't take it into account
336 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
338 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
339 if ( rc
!= wxCONV_FAILED
)
347 wxMBConv::~wxMBConv()
349 // nothing to do here (necessary for Darwin linking probably)
352 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
356 // calculate the length of the buffer needed first
357 const size_t nLen
= ToWChar(NULL
, 0, psz
);
358 if ( nLen
!= wxCONV_FAILED
)
360 // now do the actual conversion
361 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
363 // +1 for the trailing NUL
364 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
369 return wxWCharBuffer();
372 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
376 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
377 if ( nLen
!= wxCONV_FAILED
)
379 wxCharBuffer
buf(nLen
- 1);
380 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
385 return wxCharBuffer();
389 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
391 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
392 if ( dstLen
!= wxCONV_FAILED
)
394 wxWCharBuffer
wbuf(dstLen
- 1);
395 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
400 if ( wbuf
[dstLen
- 1] == L
'\0' )
411 return wxWCharBuffer();
415 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
417 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
418 if ( dstLen
!= wxCONV_FAILED
)
420 // special case of empty input: can't allocate 0 size buffer below as
421 // wxCharBuffer insists on NUL-terminating it
422 wxCharBuffer
buf(dstLen
? dstLen
- 1 : 1);
423 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
429 const size_t nulLen
= GetMBNulLen();
430 if ( dstLen
>= nulLen
&&
431 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
433 // in this case the output is NUL-terminated and we're not
434 // supposed to count NUL
446 return wxCharBuffer();
449 // ----------------------------------------------------------------------------
451 // ----------------------------------------------------------------------------
453 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
455 return wxMB2WC(buf
, psz
, n
);
458 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
460 return wxWC2MB(buf
, psz
, n
);
463 // ----------------------------------------------------------------------------
464 // wxConvBrokenFileNames
465 // ----------------------------------------------------------------------------
469 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
471 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
472 wxStricmp(charset
, _T("UTF8")) == 0 )
473 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
475 m_conv
= new wxCSConv(charset
);
480 // ----------------------------------------------------------------------------
482 // ----------------------------------------------------------------------------
484 // Implementation (C) 2004 Fredrik Roubert
// BASE64 decoding table used by the UTF-7 decoder.
//
// Indexed by an input byte (0..255). The entry is the 6-bit value that byte
// stands for in the base64 alphabet ('A'-'Z' -> 0x00-0x19, 'a'-'z' ->
// 0x1a-0x33, '0'-'9' -> 0x34-0x3d, '+' -> 0x3e, '/' -> 0x3f), or 0xff if the
// byte is not part of the alphabet. wxMBConvUTF7::MB2WC() keeps consuming
// input while the looked-up entry differs from 0xff.
static const unsigned char utf7unb64[] =
{
    // 0x00..0x27: not base64 characters
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x28..0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30..0x3f: digits
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40..0x5f: upper case letters
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60..0x7f: lower case letters
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80..0xff: not base64 characters
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
525 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
529 while ( *psz
&& (!buf
|| (len
< n
)) )
531 unsigned char cc
= *psz
++;
539 else if (*psz
== '-')
547 else // start of BASE64 encoded string
551 for ( ok
= lsb
= false, d
= 0, l
= 0;
552 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
557 for (l
+= 6; l
>= 8; lsb
= !lsb
)
559 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
569 *buf
= (wchar_t)(c
<< 8);
578 // in valid UTF7 we should have valid characters after '+'
579 return wxCONV_FAILED
;
587 if ( buf
&& (len
< n
) )
// BASE64 encoding table used by the UTF-7 encoder.
//
// Indexed by a 6-bit value (0..63); the entry is the corresponding character
// of the base64 alphabet. wxMBConvUTF7::WC2MB() always reduces the index
// modulo 64 before the lookup.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
// UTF-7 encoding table: classifies each 7-bit character (index 0..127).
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The checks visible in wxMBConvUTF7::WC2MB() test "utf7encode[cc] < 1", i.e.
// they single out class 0 characters only.
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, // 00..0F
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 10..1F
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, // 20..2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 30..3F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, // 50..5F
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3  // 70..7F
};
628 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
632 while (*psz
&& ((!buf
) || (len
< n
)))
635 if (cc
< 0x80 && utf7encode
[cc
] < 1)
644 else if (((wxUint32
)cc
) > 0xffff)
646 // no surrogate pair generation (yet?)
647 return wxCONV_FAILED
;
658 // BASE64 encode string
659 unsigned int lsb
, d
, l
;
660 for (d
= 0, l
= 0; /*nothing*/; psz
++)
662 for (lsb
= 0; lsb
< 2; lsb
++)
665 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
667 for (l
+= 8; l
>= 6; )
671 *buf
++ = utf7enb64
[(d
>> l
) % 64];
677 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
684 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
696 if (buf
&& (len
< n
))
702 // ----------------------------------------------------------------------------
704 // ----------------------------------------------------------------------------
706 static const wxUint32 utf8_max
[]=
707 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
709 // boundaries of the private use area we use to (temporarily) remap
710 // characters invalid in a UTF-8 encoded string
711 const wxUint32 wxUnicodePUA
= 0x100000;
712 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
714 // this table gives the length of the UTF-8 encoding from its first character:
715 const unsigned char tableUtf8Lengths
[256] = {
716 // single-byte sequences (ASCII):
717 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
718 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
719 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
720 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
721 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
722 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
723 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
724 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
726 // these are invalid:
727 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
728 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
733 // two-byte sequences:
734 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
735 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
737 // three-byte sequences:
738 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
740 // four-byte sequences:
741 4, 4, 4, 4, 4, // F0..F4
743 // these are invalid again (5- or 6-byte
744 // sequences and sequences for code points
745 // above U+10FFFF, as restricted by RFC 3629):
746 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
750 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
751 const char *src
, size_t srcLen
) const
753 wchar_t *out
= dstLen
? dst
: NULL
;
756 if ( srcLen
== wxNO_LEN
)
757 srcLen
= strlen(src
) + 1;
759 for ( const char *p
= src
; ; p
++ )
761 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
763 // all done successfully, just add the trailing NUL if we are not
764 // using explicit length
765 if ( srcLen
== wxNO_LEN
)
781 unsigned char c
= *p
;
782 unsigned len
= tableUtf8Lengths
[c
];
786 if ( srcLen
< len
) // the test works for wxNO_LEN too
789 if ( srcLen
!= wxNO_LEN
)
792 if ( out
&& !dstLen
-- )
796 // Char. number range | UTF-8 octet sequence
797 // (hexadecimal) | (binary)
798 // ----------------------+---------------------------------------------
799 // 0000 0000 - 0000 007F | 0xxxxxxx
800 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
801 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
802 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
804 // Code point value is stored in bits marked with 'x', lowest-order bit
805 // of the value on the right side in the diagram above.
808 // mask to extract lead byte's value ('x' bits above), by sequence length:
809 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
811 // mask and value of lead byte's most significant bits, by length:
812 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
813 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
815 len
--; // it's more convenient to work with 0-based length here
817 // extract the lead byte's value bits:
818 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
821 wxUint32 code
= c
& leadValueMask
[len
];
823 // all remaining bytes, if any, are handled in the same way regardless of
824 // sequence's length:
828 if ( (c
& 0xC0) != 0x80 )
829 return wxCONV_FAILED
;
836 // cast is ok because wchar_t == wxUint16 if WC_UTF16
837 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
846 #endif // WC_UTF16/!WC_UTF16
854 return wxCONV_FAILED
;
858 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
859 const wchar_t *src
, size_t srcLen
) const
861 char *out
= dstLen
? dst
: NULL
;
864 for ( const wchar_t *wp
= src
; ; wp
++ )
866 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
--) )
868 // all done successfully, just add the trailing NUL if we are not
869 // using explicit length
870 if ( srcLen
== wxNO_LEN
)
889 // cast is ok for WC_UTF16
890 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
892 // skip the next char too as we decoded a surrogate
895 #else // wchar_t is UTF-32
896 code
= *wp
& 0x7fffffff;
911 else if ( code
<= 0x07FF )
919 // NB: this line takes 6 least significant bits, encodes them as
920 // 10xxxxxx and discards them so that the next byte can be encoded:
921 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
922 out
[0] = 0xC0 | code
;
925 else if ( code
< 0xFFFF )
933 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
934 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
935 out
[0] = 0xE0 | code
;
938 else if ( code
<= 0x10FFFF )
946 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
947 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
948 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
949 out
[0] = 0xF0 | code
;
954 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
967 // we only get here if an error occurs during decoding
968 return wxCONV_FAILED
;
971 size_t wxMBConvUTF8::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
973 if ( m_options
== MAP_INVALID_UTF8_NOT
)
974 return wxMBConvStrictUTF8::MB2WC(buf
, psz
, n
);
978 while (*psz
&& ((!buf
) || (len
< n
)))
980 const char *opsz
= psz
;
981 bool invalid
= false;
982 unsigned char cc
= *psz
++, fc
= cc
;
984 for (cnt
= 0; fc
& 0x80; cnt
++)
994 // escape the escape character for octal escapes
995 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
996 && cc
== '\\' && (!buf
|| len
< n
))
1008 // invalid UTF-8 sequence
1013 unsigned ocnt
= cnt
- 1;
1014 wxUint32 res
= cc
& (0x3f >> cnt
);
1018 if ((cc
& 0xC0) != 0x80)
1020 // invalid UTF-8 sequence
1026 res
= (res
<< 6) | (cc
& 0x3f);
1029 if (invalid
|| res
<= utf8_max
[ocnt
])
1031 // illegal UTF-8 encoding
1034 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1035 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1037 // if one of our PUA characters turns up externally
1038 // it must also be treated as an illegal sequence
1039 // (a bit like you have to escape an escape character)
1045 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1046 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1047 if (pa
== wxCONV_FAILED
)
1059 *buf
++ = (wchar_t)res
;
1061 #endif // WC_UTF16/!WC_UTF16
1067 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1069 while (opsz
< psz
&& (!buf
|| len
< n
))
1072 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1073 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1074 wxASSERT(pa
!= wxCONV_FAILED
);
1081 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1087 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1089 while (opsz
< psz
&& (!buf
|| len
< n
))
1091 if ( buf
&& len
+ 3 < n
)
1093 unsigned char on
= *opsz
;
1095 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1096 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1097 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1104 else // MAP_INVALID_UTF8_NOT
1106 return wxCONV_FAILED
;
1112 if (buf
&& (len
< n
))
// Returns true if wch is one of the octal digits L'0'..L'7'.
//
// wxMBConvUTF8::WC2MB() calls this on three consecutive characters when
// handling MAP_INVALID_UTF8_TO_OCTAL.
static inline bool isoctal(wchar_t wch)
{
    return L'0' <= wch && wch <= L'7';
}
1123 size_t wxMBConvUTF8::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
1125 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1126 return wxMBConvStrictUTF8::WC2MB(buf
, psz
, n
);
1130 while (*psz
&& ((!buf
) || (len
< n
)))
1135 // cast is ok for WC_UTF16
1136 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1137 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1139 cc
= (*psz
++) & 0x7fffffff;
1142 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1143 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1146 *buf
++ = (char)(cc
- wxUnicodePUA
);
1149 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1150 && cc
== L
'\\' && psz
[0] == L
'\\' )
1157 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1159 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1163 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1164 (psz
[1] - L
'0') * 010 +
1174 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1190 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1192 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1198 if (buf
&& (len
< n
))
1204 // ============================================================================
1206 // ============================================================================
1208 #ifdef WORDS_BIGENDIAN
1209 #define wxMBConvUTF16straight wxMBConvUTF16BE
1210 #define wxMBConvUTF16swap wxMBConvUTF16LE
1212 #define wxMBConvUTF16swap wxMBConvUTF16BE
1213 #define wxMBConvUTF16straight wxMBConvUTF16LE
1217 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1219 if ( srcLen
== wxNO_LEN
)
1221 // count the number of bytes in input, including the trailing NULs
1222 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1223 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1226 srcLen
*= BYTES_PER_CHAR
;
1228 else // we already have the length
1230 // we can only convert an entire number of UTF-16 characters
1231 if ( srcLen
% BYTES_PER_CHAR
)
1232 return wxCONV_FAILED
;
1238 // case when in-memory representation is UTF-16 too
1241 // ----------------------------------------------------------------------------
1242 // conversions without endianness change
1243 // ----------------------------------------------------------------------------
1246 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1247 const char *src
, size_t srcLen
) const
1249 // set up the scene for using memcpy() (which is presumably more efficient
1250 // than copying the bytes one by one)
1251 srcLen
= GetLength(src
, srcLen
);
1252 if ( srcLen
== wxNO_LEN
)
1253 return wxCONV_FAILED
;
1255 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1258 if ( dstLen
< inLen
)
1259 return wxCONV_FAILED
;
1261 memcpy(dst
, src
, srcLen
);
1268 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1269 const wchar_t *src
, size_t srcLen
) const
1271 if ( srcLen
== wxNO_LEN
)
1272 srcLen
= wxWcslen(src
) + 1;
1274 srcLen
*= BYTES_PER_CHAR
;
1278 if ( dstLen
< srcLen
)
1279 return wxCONV_FAILED
;
1281 memcpy(dst
, src
, srcLen
);
1287 // ----------------------------------------------------------------------------
1288 // endian-reversing conversions
1289 // ----------------------------------------------------------------------------
1292 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1293 const char *src
, size_t srcLen
) const
1295 srcLen
= GetLength(src
, srcLen
);
1296 if ( srcLen
== wxNO_LEN
)
1297 return wxCONV_FAILED
;
1299 srcLen
/= BYTES_PER_CHAR
;
1303 if ( dstLen
< srcLen
)
1304 return wxCONV_FAILED
;
1306 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1307 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1309 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1317 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1318 const wchar_t *src
, size_t srcLen
) const
1320 if ( srcLen
== wxNO_LEN
)
1321 srcLen
= wxWcslen(src
) + 1;
1323 srcLen
*= BYTES_PER_CHAR
;
1327 if ( dstLen
< srcLen
)
1328 return wxCONV_FAILED
;
1330 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1331 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1333 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1340 #else // !WC_UTF16: wchar_t is UTF-32
1342 // ----------------------------------------------------------------------------
1343 // conversions without endianness change
1344 // ----------------------------------------------------------------------------
1347 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1348 const char *src
, size_t srcLen
) const
1350 srcLen
= GetLength(src
, srcLen
);
1351 if ( srcLen
== wxNO_LEN
)
1352 return wxCONV_FAILED
;
1354 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1357 // optimization: return maximal space which could be needed for this
1358 // string even if the real size could be smaller if the buffer contains
1364 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1365 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1367 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1369 return wxCONV_FAILED
;
1371 if ( ++outLen
> dstLen
)
1372 return wxCONV_FAILED
;
1382 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1383 const wchar_t *src
, size_t srcLen
) const
1385 if ( srcLen
== wxNO_LEN
)
1386 srcLen
= wxWcslen(src
) + 1;
1389 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1390 for ( size_t n
= 0; n
< srcLen
; n
++ )
1393 const size_t numChars
= encode_utf16(*src
++, cc
);
1394 if ( numChars
== wxCONV_FAILED
)
1395 return wxCONV_FAILED
;
1397 outLen
+= numChars
* BYTES_PER_CHAR
;
1400 if ( outLen
> dstLen
)
1401 return wxCONV_FAILED
;
1404 if ( numChars
== 2 )
1406 // second character of a surrogate
1415 // ----------------------------------------------------------------------------
1416 // endian-reversing conversions
1417 // ----------------------------------------------------------------------------
1420 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1421 const char *src
, size_t srcLen
) const
1423 srcLen
= GetLength(src
, srcLen
);
1424 if ( srcLen
== wxNO_LEN
)
1425 return wxCONV_FAILED
;
1427 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1430 // optimization: return maximal space which could be needed for this
1431 // string even if the real size could be smaller if the buffer contains
1437 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1438 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1443 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1445 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1447 const size_t numChars
= decode_utf16(tmp
, ch
);
1448 if ( numChars
== wxCONV_FAILED
)
1449 return wxCONV_FAILED
;
1451 if ( numChars
== 2 )
1454 if ( ++outLen
> dstLen
)
1455 return wxCONV_FAILED
;
1465 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1466 const wchar_t *src
, size_t srcLen
) const
1468 if ( srcLen
== wxNO_LEN
)
1469 srcLen
= wxWcslen(src
) + 1;
1472 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1473 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1476 const size_t numChars
= encode_utf16(*src
, cc
);
1477 if ( numChars
== wxCONV_FAILED
)
1478 return wxCONV_FAILED
;
1480 outLen
+= numChars
* BYTES_PER_CHAR
;
1483 if ( outLen
> dstLen
)
1484 return wxCONV_FAILED
;
1486 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1487 if ( numChars
== 2 )
1489 // second character of a surrogate
1490 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1498 #endif // WC_UTF16/!WC_UTF16
1501 // ============================================================================
1503 // ============================================================================
1505 #ifdef WORDS_BIGENDIAN
1506 #define wxMBConvUTF32straight wxMBConvUTF32BE
1507 #define wxMBConvUTF32swap wxMBConvUTF32LE
1509 #define wxMBConvUTF32swap wxMBConvUTF32BE
1510 #define wxMBConvUTF32straight wxMBConvUTF32LE
1514 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1515 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1518 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1520 if ( srcLen
== wxNO_LEN
)
1522 // count the number of bytes in input, including the trailing NULs
1523 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1524 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1527 srcLen
*= BYTES_PER_CHAR
;
1529 else // we already have the length
1531 // we can only convert an entire number of UTF-32 characters
1532 if ( srcLen
% BYTES_PER_CHAR
)
1533 return wxCONV_FAILED
;
1539 // case when in-memory representation is UTF-16
1542 // ----------------------------------------------------------------------------
1543 // conversions without endianness change
1544 // ----------------------------------------------------------------------------
1547 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1548 const char *src
, size_t srcLen
) const
1550 srcLen
= GetLength(src
, srcLen
);
1551 if ( srcLen
== wxNO_LEN
)
1552 return wxCONV_FAILED
;
1554 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1555 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1557 for ( size_t n
= 0; n
< inLen
; n
++ )
1560 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1561 if ( numChars
== wxCONV_FAILED
)
1562 return wxCONV_FAILED
;
1567 if ( outLen
> dstLen
)
1568 return wxCONV_FAILED
;
1571 if ( numChars
== 2 )
1573 // second character of a surrogate
1583 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1584 const wchar_t *src
, size_t srcLen
) const
1586 if ( srcLen
== wxNO_LEN
)
1587 srcLen
= wxWcslen(src
) + 1;
1591 // optimization: return maximal space which could be needed for this
1592 // string instead of the exact amount which could be less if there are
1593 // any surrogates in the input
1595 // we consider that surrogates are rare enough to make it worthwhile to
1596 // avoid running the loop below at the cost of slightly extra memory
1598 return srcLen
* BYTES_PER_CHAR
;
1601 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1603 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1605 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1607 return wxCONV_FAILED
;
1609 outLen
+= BYTES_PER_CHAR
;
1611 if ( outLen
> dstLen
)
1612 return wxCONV_FAILED
;
1620 // ----------------------------------------------------------------------------
1621 // endian-reversing conversions
1622 // ----------------------------------------------------------------------------
1625 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1626 const char *src
, size_t srcLen
) const
1628 srcLen
= GetLength(src
, srcLen
);
1629 if ( srcLen
== wxNO_LEN
)
1630 return wxCONV_FAILED
;
1632 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1633 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1635 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1638 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1639 if ( numChars
== wxCONV_FAILED
)
1640 return wxCONV_FAILED
;
1645 if ( outLen
> dstLen
)
1646 return wxCONV_FAILED
;
1649 if ( numChars
== 2 )
1651 // second character of a surrogate
1661 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1662 const wchar_t *src
, size_t srcLen
) const
1664 if ( srcLen
== wxNO_LEN
)
1665 srcLen
= wxWcslen(src
) + 1;
1669 // optimization: return maximal space which could be needed for this
1670 // string instead of the exact amount which could be less if there are
1671 // any surrogates in the input
1673 // we consider that surrogates are rare enough to make it worthwhile to
1674 // avoid running the loop below at the cost of slightly extra memory
1676 return srcLen
*BYTES_PER_CHAR
;
1679 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1681 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1683 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1685 return wxCONV_FAILED
;
1687 outLen
+= BYTES_PER_CHAR
;
1689 if ( outLen
> dstLen
)
1690 return wxCONV_FAILED
;
1692 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1698 #else // !WC_UTF16: wchar_t is UTF-32
1700 // ----------------------------------------------------------------------------
1701 // conversions without endianness change
1702 // ----------------------------------------------------------------------------
1705 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1706 const char *src
, size_t srcLen
) const
1708 // use memcpy() as it should be much faster than hand-written loop
1709 srcLen
= GetLength(src
, srcLen
);
1710 if ( srcLen
== wxNO_LEN
)
1711 return wxCONV_FAILED
;
1713 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1716 if ( dstLen
< inLen
)
1717 return wxCONV_FAILED
;
1719 memcpy(dst
, src
, srcLen
);
1726 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1727 const wchar_t *src
, size_t srcLen
) const
1729 if ( srcLen
== wxNO_LEN
)
1730 srcLen
= wxWcslen(src
) + 1;
1732 srcLen
*= BYTES_PER_CHAR
;
1736 if ( dstLen
< srcLen
)
1737 return wxCONV_FAILED
;
1739 memcpy(dst
, src
, srcLen
);
1745 // ----------------------------------------------------------------------------
1746 // endian-reversing conversions
1747 // ----------------------------------------------------------------------------
1750 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1751 const char *src
, size_t srcLen
) const
1753 srcLen
= GetLength(src
, srcLen
);
1754 if ( srcLen
== wxNO_LEN
)
1755 return wxCONV_FAILED
;
1757 srcLen
/= BYTES_PER_CHAR
;
1761 if ( dstLen
< srcLen
)
1762 return wxCONV_FAILED
;
1764 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1765 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1767 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1775 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1776 const wchar_t *src
, size_t srcLen
) const
1778 if ( srcLen
== wxNO_LEN
)
1779 srcLen
= wxWcslen(src
) + 1;
1781 srcLen
*= BYTES_PER_CHAR
;
1785 if ( dstLen
< srcLen
)
1786 return wxCONV_FAILED
;
1788 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1789 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1791 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1798 #endif // WC_UTF16/!WC_UTF16
1801 // ============================================================================
1802 // The classes doing conversion using the iconv_xxx() functions
1803 // ============================================================================
1807 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1808 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1809 // (unless there's yet another bug in glibc) the only case when iconv()
1810 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1811 // left in the input buffer -- when _real_ error occurs,
1812 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1814 // [This bug does not appear in glibc 2.2.]
1815 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1816 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1817 (errno != E2BIG || bufLeft != 0))
1819 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1822 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1824 #define ICONV_T_INVALID ((iconv_t)-1)
1826 #if SIZEOF_WCHAR_T == 4
1827 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1828 #define WC_ENC wxFONTENCODING_UTF32
1829 #elif SIZEOF_WCHAR_T == 2
1830 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1831 #define WC_ENC wxFONTENCODING_UTF16
1832 #else // sizeof(wchar_t) != 2 nor 4
1833 // does this ever happen?
1834 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1837 // ----------------------------------------------------------------------------
1838 // wxMBConv_iconv: encapsulates an iconv character set
1839 // ----------------------------------------------------------------------------
1841 class wxMBConv_iconv
: public wxMBConv
1844 wxMBConv_iconv(const char *name
);
1845 virtual ~wxMBConv_iconv();
1847 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1848 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1850 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1851 virtual size_t GetMBNulLen() const;
1853 #if wxUSE_UNICODE_UTF8
1854 virtual bool IsUTF8() const;
1857 virtual wxMBConv
*Clone() const
1859 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
1860 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1865 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1868 // the iconv handlers used to translate from multibyte
1869 // to wide char and in the other direction
1874 // guards access to m2w and w2m objects
1875 wxMutex m_iconvMutex
;
1879 // the name (for iconv_open()) of a wide char charset -- if none is
1880 // available on this machine, it will remain NULL
1881 static wxString ms_wcCharsetName
;
1883 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1884 // different endian-ness than the native one
1885 static bool ms_wcNeedsSwap
;
1888 // name of the encoding handled by this conversion
1891 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1893 size_t m_minMBCharWidth
;
1896 // make the constructor available for unit testing
1897 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
1899 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1900 if ( !result
->IsOk() )
1909 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1910 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1912 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
1915 m_minMBCharWidth
= 0;
1917 // check for charset that represents wchar_t:
1918 if ( ms_wcCharsetName
.empty() )
1920 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1923 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1924 #else // !wxUSE_FONTMAP
1925 static const wxChar
*names_static
[] =
1927 #if SIZEOF_WCHAR_T == 4
1929 #elif SIZEOF_WCHAR_T = 2
1934 const wxChar
**names
= names_static
;
1935 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1937 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1939 const wxString
nameCS(*names
);
1941 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1942 wxString
nameXE(nameCS
);
1944 #ifdef WORDS_BIGENDIAN
1946 #else // little endian
1950 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1953 m2w
= iconv_open(nameXE
.ToAscii(), name
);
1954 if ( m2w
== ICONV_T_INVALID
)
1956 // try charset w/o bytesex info (e.g. "UCS4")
1957 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1959 m2w
= iconv_open(nameCS
.ToAscii(), name
);
1961 // and check for bytesex ourselves:
1962 if ( m2w
!= ICONV_T_INVALID
)
1964 char buf
[2], *bufPtr
;
1965 wchar_t wbuf
[2], *wbufPtr
;
1973 outsz
= SIZEOF_WCHAR_T
* 2;
1978 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
1979 (char**)&wbufPtr
, &outsz
);
1981 if (ICONV_FAILED(res
, insz
))
1983 wxLogLastError(wxT("iconv"));
1984 wxLogError(_("Conversion to charset '%s' doesn't work."),
1987 else // ok, can convert to this encoding, remember it
1989 ms_wcCharsetName
= nameCS
;
1990 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
1994 else // use charset not requiring byte swapping
1996 ms_wcCharsetName
= nameXE
;
2000 wxLogTrace(TRACE_STRCONV
,
2001 wxT("iconv wchar_t charset is \"%s\"%s"),
2002 ms_wcCharsetName
.empty() ? wxString("<none>")
2004 ms_wcNeedsSwap
? _T(" (needs swap)")
2007 else // we already have ms_wcCharsetName
2009 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2012 if ( ms_wcCharsetName
.empty() )
2014 w2m
= ICONV_T_INVALID
;
2018 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2019 if ( w2m
== ICONV_T_INVALID
)
2021 wxLogTrace(TRACE_STRCONV
,
2022 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2023 ms_wcCharsetName
.c_str(), name
);
2028 wxMBConv_iconv::~wxMBConv_iconv()
2030 if ( m2w
!= ICONV_T_INVALID
)
2032 if ( w2m
!= ICONV_T_INVALID
)
2036 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2038 // find the string length: notice that must be done differently for
2039 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2041 const size_t nulLen
= GetMBNulLen();
2045 return wxCONV_FAILED
;
2048 inbuf
= strlen(psz
); // arguably more optimized than our version
2053 // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2054 // they also have to start at character boundary and not span two
2055 // adjacent characters
2057 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2064 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2065 // Unfortunately there are a couple of global wxCSConv objects such as
2066 // wxConvLocal that are used all over wx code, so we have to make sure
2067 // the handle is used by at most one thread at the time. Otherwise
2068 // only a few wx classes would be safe to use from non-main threads
2069 // as MB<->WC conversion would fail "randomly".
2070 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2071 #endif // wxUSE_THREADS
2073 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
2075 // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2076 wchar_t *bufPtr
= buf
;
2077 const char *pszPtr
= psz
;
2081 // have destination buffer, convert there
2083 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
2084 (char**)&bufPtr
, &outbuf
);
2085 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
2089 // convert to native endianness
2090 for ( unsigned i
= 0; i
< res
; i
++ )
2091 buf
[n
] = WC_BSWAP(buf
[i
]);
2094 // NUL-terminate the string if there is any space left
2100 // no destination buffer... convert using temp buffer
2101 // to calculate destination buffer requirement
2108 outbuf
= 8 * SIZEOF_WCHAR_T
;
2111 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
2112 (char**)&bufPtr
, &outbuf
);
2114 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
2116 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2119 if (ICONV_FAILED(cres
, inbuf
))
2121 //VS: it is ok if iconv fails, hence trace only
2122 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2123 return wxCONV_FAILED
;
2129 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2132 // NB: explained in MB2WC
2133 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2136 size_t inlen
= wxWcslen(psz
);
2137 size_t inbuf
= inlen
* SIZEOF_WCHAR_T
;
2141 wchar_t *tmpbuf
= 0;
2145 // need to copy to temp buffer to switch endianness
2146 // (doing WC_BSWAP twice on the original buffer won't help, as it
2147 // could be in read-only memory, or be accessed in some other thread)
2148 tmpbuf
= (wchar_t *)malloc(inbuf
+ SIZEOF_WCHAR_T
);
2149 for ( size_t i
= 0; i
< inlen
; i
++ )
2150 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
2152 tmpbuf
[inlen
] = L
'\0';
2158 // have destination buffer, convert there
2159 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
2163 // NB: iconv was given only wcslen(psz) characters on input, and so
2164 // it couldn't convert the trailing zero. Let's do it ourselves
2165 // if there's some room left for it in the output buffer.
2171 // no destination buffer: convert using temp buffer
2172 // to calculate destination buffer requirement
2180 cres
= iconv( w2m
, ICONV_CHAR_CAST(&psz
), &inbuf
, &buf
, &outbuf
);
2184 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2192 if (ICONV_FAILED(cres
, inbuf
))
2194 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2195 return wxCONV_FAILED
;
2201 size_t wxMBConv_iconv::GetMBNulLen() const
2203 if ( m_minMBCharWidth
== 0 )
2205 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2208 // NB: explained in MB2WC
2209 wxMutexLocker
lock(self
->m_iconvMutex
);
2212 const wchar_t *wnul
= L
"";
2213 char buf
[8]; // should be enough for NUL in any encoding
2214 size_t inLen
= sizeof(wchar_t),
2215 outLen
= WXSIZEOF(buf
);
2216 char *inBuff
= (char *)wnul
;
2217 char *outBuff
= buf
;
2218 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2220 self
->m_minMBCharWidth
= (size_t)-1;
2224 self
->m_minMBCharWidth
= outBuff
- buf
;
2228 return m_minMBCharWidth
;
2231 #if wxUSE_UNICODE_UTF8
2232 bool wxMBConv_iconv::IsUTF8() const
2234 return wxStricmp(m_name
, "UTF-8") == 0 ||
2235 wxStricmp(m_name
, "UTF8") == 0;
2239 #endif // HAVE_ICONV
2242 // ============================================================================
2243 // Win32 conversion classes
2244 // ============================================================================
2246 #ifdef wxHAVE_WIN32_MB2WC
2250 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2251 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2254 class wxMBConv_win32
: public wxMBConv
2259 m_CodePage
= CP_ACP
;
2260 m_minMBCharWidth
= 0;
2263 wxMBConv_win32(const wxMBConv_win32
& conv
)
2266 m_CodePage
= conv
.m_CodePage
;
2267 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2271 wxMBConv_win32(const char* name
)
2273 m_CodePage
= wxCharsetToCodepage(name
);
2274 m_minMBCharWidth
= 0;
2277 wxMBConv_win32(wxFontEncoding encoding
)
2279 m_CodePage
= wxEncodingToCodepage(encoding
);
2280 m_minMBCharWidth
= 0;
2282 #endif // wxUSE_FONTMAP
2284 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2286 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2287 // the behaviour is not compatible with the Unix version (using iconv)
2288 // and break the library itself, e.g. wxTextInputStream::NextChar()
2289 // wouldn't work if reading an incomplete MB char didn't result in an
2292 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2293 // Win XP or newer and it is not supported for UTF-[78] so we always
2294 // use our own conversions in this case. See
2295 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2296 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2297 if ( m_CodePage
== CP_UTF8
)
2299 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2302 if ( m_CodePage
== CP_UTF7
)
2304 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2308 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2309 IsAtLeastWin2kSP4() )
2311 flags
= MB_ERR_INVALID_CHARS
;
2314 const size_t len
= ::MultiByteToWideChar
2316 m_CodePage
, // code page
2317 flags
, // flags: fall on error
2318 psz
, // input string
2319 -1, // its length (NUL-terminated)
2320 buf
, // output string
2321 buf
? n
: 0 // size of output buffer
2325 // function totally failed
2326 return wxCONV_FAILED
;
2329 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2330 // check if we succeeded, by doing a double trip:
2331 if ( !flags
&& buf
)
2333 const size_t mbLen
= strlen(psz
);
2334 wxCharBuffer
mbBuf(mbLen
);
2335 if ( ::WideCharToMultiByte
2342 mbLen
+ 1, // size in bytes, not length
2346 strcmp(mbBuf
, psz
) != 0 )
2348 // we didn't obtain the same thing we started from, hence
2349 // the conversion was lossy and we consider that it failed
2350 return wxCONV_FAILED
;
2354 // note that it returns count of written chars for buf != NULL and size
2355 // of the needed buffer for buf == NULL so in either case the length of
2356 // the string (which never includes the terminating NUL) is one less
2360 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2363 we have a problem here: by default, WideCharToMultiByte() may
2364 replace characters unrepresentable in the target code page with bad
2365 quality approximations such as turning "1/2" symbol (U+00BD) into
2366 "1" for the code pages which don't have it and we, obviously, want
2367 to avoid this at any price
2369 the trouble is that this function does it _silently_, i.e. it won't
2370 even tell us whether it did or not... Win98/2000 and higher provide
2371 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2372 we have to resort to a round trip, i.e. check that converting back
2373 results in the same string -- this is, of course, expensive but
2374 otherwise we simply can't be sure to not garble the data.
2377 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2378 // it doesn't work with CJK encodings (which we test for rather roughly
2379 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2381 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2384 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2386 // it's our lucky day
2387 flags
= WC_NO_BEST_FIT_CHARS
;
2388 pUsedDef
= &usedDef
;
2390 else // old system or unsupported encoding
2396 const size_t len
= ::WideCharToMultiByte
2398 m_CodePage
, // code page
2399 flags
, // either none or no best fit
2400 pwz
, // input string
2401 -1, // it is (wide) NUL-terminated
2402 buf
, // output buffer
2403 buf
? n
: 0, // and its size
2404 NULL
, // default "replacement" char
2405 pUsedDef
// [out] was it used?
2410 // function totally failed
2411 return wxCONV_FAILED
;
2414 // if we were really converting, check if we succeeded
2419 // check if the conversion failed, i.e. if any replacements
2422 return wxCONV_FAILED
;
2424 else // we must resort to double tripping...
2426 wxWCharBuffer
wcBuf(n
);
2427 if ( MB2WC(wcBuf
.data(), buf
, n
) == wxCONV_FAILED
||
2428 wcscmp(wcBuf
, pwz
) != 0 )
2430 // we didn't obtain the same thing we started from, hence
2431 // the conversion was lossy and we consider that it failed
2432 return wxCONV_FAILED
;
2437 // see the comment above for the reason of "len - 1"
2441 virtual size_t GetMBNulLen() const
2443 if ( m_minMBCharWidth
== 0 )
2445 int len
= ::WideCharToMultiByte
2447 m_CodePage
, // code page
2449 L
"", // input string
2450 1, // translate just the NUL
2451 NULL
, // output buffer
2453 NULL
, // no replacement char
2454 NULL
// [out] don't care if it was used
2457 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2461 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2462 self
->m_minMBCharWidth
= (size_t)-1;
2466 self
->m_minMBCharWidth
= (size_t)-1;
2472 self
->m_minMBCharWidth
= len
;
2477 return m_minMBCharWidth
;
2480 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2482 bool IsOk() const { return m_CodePage
!= -1; }
2485 static bool CanUseNoBestFit()
2487 static int s_isWin98Or2k
= -1;
2489 if ( s_isWin98Or2k
== -1 )
2492 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2494 case wxOS_WINDOWS_9X
:
2495 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2498 case wxOS_WINDOWS_NT
:
2499 s_isWin98Or2k
= verMaj
>= 5;
2503 // unknown: be conservative by default
2508 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2511 return s_isWin98Or2k
== 1;
2514 static bool IsAtLeastWin2kSP4()
2519 static int s_isAtLeastWin2kSP4
= -1;
2521 if ( s_isAtLeastWin2kSP4
== -1 )
2523 OSVERSIONINFOEX ver
;
2525 memset(&ver
, 0, sizeof(ver
));
2526 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2527 GetVersionEx((OSVERSIONINFO
*)&ver
);
2529 s_isAtLeastWin2kSP4
=
2530 ((ver
.dwMajorVersion
> 5) || // Vista+
2531 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2532 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2533 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2537 return s_isAtLeastWin2kSP4
== 1;
2542 // the code page we're working with
2545 // cached result of GetMBNulLen(), set to 0 initially meaning
2547 size_t m_minMBCharWidth
;
2550 #endif // wxHAVE_WIN32_MB2WC
2553 // ============================================================================
2554 // wxEncodingConverter based conversion classes
2555 // ============================================================================
2559 class wxMBConv_wxwin
: public wxMBConv
2564 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2565 // The wxMBConv_cf class does a better job.
2566 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2567 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2568 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2572 // temporarily just use wxEncodingConverter stuff,
2573 // so that it works while a better implementation is built
2574 wxMBConv_wxwin(const char* name
)
2577 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2579 m_enc
= wxFONTENCODING_SYSTEM
;
2584 wxMBConv_wxwin(wxFontEncoding enc
)
2591 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2593 size_t inbuf
= strlen(psz
);
2596 if (!m2w
.Convert(psz
, buf
))
2597 return wxCONV_FAILED
;
2602 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2604 const size_t inbuf
= wxWcslen(psz
);
2607 if (!w2m
.Convert(psz
, buf
))
2608 return wxCONV_FAILED
;
2614 virtual size_t GetMBNulLen() const
2618 case wxFONTENCODING_UTF16BE
:
2619 case wxFONTENCODING_UTF16LE
:
2622 case wxFONTENCODING_UTF32BE
:
2623 case wxFONTENCODING_UTF32LE
:
2631 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2633 bool IsOk() const { return m_ok
; }
2636 wxFontEncoding m_enc
;
2637 wxEncodingConverter m2w
, w2m
;
2640 // were we initialized successfully?
2643 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2646 // make the constructors available for unit testing
2647 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2649 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2650 if ( !result
->IsOk() )
2659 #endif // wxUSE_FONTMAP
2661 // ============================================================================
2662 // wxCSConv implementation
2663 // ============================================================================
2665 void wxCSConv::Init()
2672 wxCSConv::wxCSConv(const wxString
& charset
)
2676 if ( !charset
.empty() )
2678 SetName(charset
.ToAscii());
2682 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2684 m_encoding
= wxFONTENCODING_SYSTEM
;
2688 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2690 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2692 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2694 encoding
= wxFONTENCODING_SYSTEM
;
2699 m_encoding
= encoding
;
2702 wxCSConv::~wxCSConv()
2707 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2712 SetName(conv
.m_name
);
2713 m_encoding
= conv
.m_encoding
;
2716 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2720 SetName(conv
.m_name
);
2721 m_encoding
= conv
.m_encoding
;
2726 void wxCSConv::Clear()
2735 void wxCSConv::SetName(const char *charset
)
2739 m_name
= wxStrdup(charset
);
2746 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2747 wxEncodingNameCache
);
2749 static wxEncodingNameCache gs_nameCache
;
2752 wxMBConv
*wxCSConv::DoCreate() const
2755 wxLogTrace(TRACE_STRCONV
,
2756 wxT("creating conversion for %s"),
2758 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2759 #endif // wxUSE_FONTMAP
2761 // check for the special case of ASCII or ISO8859-1 charset: as we have
2762 // special knowledge of it anyhow, we don't need to create a special
2763 // conversion object
2764 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2765 m_encoding
== wxFONTENCODING_DEFAULT
)
2767 // don't convert at all
2771 // we trust OS to do conversion better than we can so try external
2772 // conversion methods first
2774 // the full order is:
2775 // 1. OS conversion (iconv() under Unix or Win32 API)
2776 // 2. hard coded conversions for UTF
2777 // 3. wxEncodingConverter as fall back
2783 #endif // !wxUSE_FONTMAP
2786 wxFontEncoding
encoding(m_encoding
);
2791 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2799 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2800 #endif // wxUSE_FONTMAP
2804 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2805 if ( it
!= gs_nameCache
.end() )
2807 if ( it
->second
.empty() )
2810 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
2817 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2818 // CS : in case this does not return valid names (eg for MacRoman)
2819 // encoding got a 'failure' entry in the cache all the same,
2820 // although it just has to be created using a different method, so
2821 // only store failed iconv creation attempts (or perhaps we
2822 // shouldn't do this at all?)
2823 if ( names
[0] != NULL
)
2825 for ( ; *names
; ++names
)
2827 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2828 // will need changes that will obsolete this
2829 wxString
name(*names
);
2830 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
2833 gs_nameCache
[encoding
] = *names
;
2840 gs_nameCache
[encoding
] = _T(""); // cache the failure
2843 #endif // wxUSE_FONTMAP
2845 #endif // HAVE_ICONV
2847 #ifdef wxHAVE_WIN32_MB2WC
2850 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2851 : new wxMBConv_win32(m_encoding
);
2860 #endif // wxHAVE_WIN32_MB2WC
2864 // leave UTF16 and UTF32 to the built-ins of wx
2865 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2866 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2869 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
2870 : new wxMBConv_cf(m_encoding
);
2872 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
2881 #endif // __DARWIN__
2884 wxFontEncoding enc
= m_encoding
;
2886 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2888 // use "false" to suppress interactive dialogs -- we can be called from
2889 // anywhere and popping up a dialog from here is the last thing we want to
2891 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2893 #endif // wxUSE_FONTMAP
2897 case wxFONTENCODING_UTF7
:
2898 return new wxMBConvUTF7
;
2900 case wxFONTENCODING_UTF8
:
2901 return new wxMBConvUTF8
;
2903 case wxFONTENCODING_UTF16BE
:
2904 return new wxMBConvUTF16BE
;
2906 case wxFONTENCODING_UTF16LE
:
2907 return new wxMBConvUTF16LE
;
2909 case wxFONTENCODING_UTF32BE
:
2910 return new wxMBConvUTF32BE
;
2912 case wxFONTENCODING_UTF32LE
:
2913 return new wxMBConvUTF32LE
;
2916 // nothing to do but put here to suppress gcc warnings
2923 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2924 : new wxMBConv_wxwin(m_encoding
);
2930 #endif // wxUSE_FONTMAP
2932 // NB: This is a hack to prevent deadlock. What could otherwise happen
2933 // in Unicode build: wxConvLocal creation ends up being here
2934 // because of some failure and logs the error. But wxLog will try to
2935 // attach a timestamp, for which it will need wxConvLocal (to convert
2936 // time to char* and then wchar_t*), but that fails, tries to log the
2937 // error, but wxLog has an (already locked) critical section that
2938 // guards the static buffer.
2939 static bool alreadyLoggingError
= false;
2940 if (!alreadyLoggingError
)
2942 alreadyLoggingError
= true;
2943 wxLogError(_("Cannot convert from the charset '%s'!"),
2947 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
2948 #else // !wxUSE_FONTMAP
2949 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
2950 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2953 alreadyLoggingError
= false;
2959 void wxCSConv::CreateConvIfNeeded() const
2963 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
2965 // if we don't have neither the name nor the encoding, use the default
2966 // encoding for this system
2967 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
2970 self
->m_encoding
= wxLocale::GetSystemEncoding();
2972 // fallback to some reasonable default:
2973 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
2974 #endif // wxUSE_INTL
2977 self
->m_convReal
= DoCreate();
2978 self
->m_deferred
= false;
2982 bool wxCSConv::IsOk() const
2984 CreateConvIfNeeded();
2986 // special case: no convReal created for wxFONTENCODING_ISO8859_1
2987 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
2988 return true; // always ok as we do it ourselves
2990 // m_convReal->IsOk() is called at its own creation, so we know it must
2991 // be ok if m_convReal is non-NULL
2992 return m_convReal
!= NULL
;
2995 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
2996 const char *src
, size_t srcLen
) const
2998 CreateConvIfNeeded();
3001 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3004 return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
);
3007 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3008 const wchar_t *src
, size_t srcLen
) const
3010 CreateConvIfNeeded();
3013 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3016 return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
);
3019 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3021 CreateConvIfNeeded();
3024 return m_convReal
->MB2WC(buf
, psz
, n
);
3027 size_t len
= strlen(psz
);
3031 for (size_t c
= 0; c
<= len
; c
++)
3032 buf
[c
] = (unsigned char)(psz
[c
]);
3038 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3040 CreateConvIfNeeded();
3043 return m_convReal
->WC2MB(buf
, psz
, n
);
3046 const size_t len
= wxWcslen(psz
);
3049 for (size_t c
= 0; c
<= len
; c
++)
3052 return wxCONV_FAILED
;
3054 buf
[c
] = (char)psz
[c
];
3059 for (size_t c
= 0; c
<= len
; c
++)
3062 return wxCONV_FAILED
;
3069 size_t wxCSConv::GetMBNulLen() const
3071 CreateConvIfNeeded();
3075 return m_convReal
->GetMBNulLen();
3078 // otherwise, we are ISO-8859-1
3082 #if wxUSE_UNICODE_UTF8
3083 bool wxCSConv::IsUTF8() const
3085 CreateConvIfNeeded();
3089 return m_convReal
->IsUTF8();
3092 // otherwise, we are ISO-8859-1
3100 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3103 return wxWCharBuffer();
3105 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3107 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3109 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
3114 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3117 return wxCharBuffer();
3119 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3121 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3126 #endif // wxUSE_UNICODE
3128 // ----------------------------------------------------------------------------
3130 // ----------------------------------------------------------------------------
3132 // NB: The reason why we create converted objects in this convoluted way,
3133 // using a factory function instead of global variable, is that they
3134 // may be used at static initialization time (some of them are used by
3135 // wxString ctors and there may be a global wxString object). In other
3136 // words, possibly _before_ the converter global object would be
3143 #undef wxConvISO8859_1
3145 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3146 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3147 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3149 static impl_klass name##Obj ctor_args; \
3150 return &name##Obj; \
3152 /* this ensures that all global converter objects are created */ \
3153 /* by the time static initialization is done, i.e. before any */ \
3154 /* thread is launched: */ \
3155 static klass* gs_##name##instance = wxGet_##name##Ptr()
3157 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3158 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3161 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3163 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3166 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, wxEMPTY_PARAMETER_VALUE
);
3167 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, wxEMPTY_PARAMETER_VALUE
);
3169 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3170 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
3172 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3173 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3176 // The xnu kernel always communicates file paths in decomposed UTF-8.
3177 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3178 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
3181 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3184 #else // !__DARWIN__
3185 wxGet_wxConvLibcPtr();
3186 #endif // __DARWIN__/!__DARWIN__
3188 #else // !wxUSE_WCHAR_T
3190 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3191 // stand-ins in absence of wchar_t
3192 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3197 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T