1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/mac/corefoundation/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV _T("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
74 static bool NotAllNULs(const char *p
, size_t n
)
76 while ( n
&& *p
++ == '\0' )
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten
= 0;
172 // the number of NULs terminating this string
173 size_t nulLen
= 0; // not really needed, but just to avoid warnings
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
181 if ( srcLen
!= wxNO_LEN
)
183 // we need to know how to find the end of this string
184 nulLen
= GetMBNulLen();
185 if ( nulLen
== wxCONV_FAILED
)
186 return wxCONV_FAILED
;
188 // if there are enough NULs we can avoid the copy
189 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
191 // make a copy in order to properly NUL-terminate the string
192 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
193 char * const p
= bufTmp
.data();
194 memcpy(p
, src
, srcLen
);
195 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
201 srcEnd
= src
+ srcLen
;
203 else // quit after the first loop iteration
210 // try to convert the current chunk
211 size_t lenChunk
= MB2WC(NULL
, src
, 0);
212 if ( lenChunk
== wxCONV_FAILED
)
213 return wxCONV_FAILED
;
215 lenChunk
++; // for the L'\0' at the end of this chunk
217 dstWritten
+= lenChunk
;
221 // nothing left in the input string, conversion succeeded
227 if ( dstWritten
> dstLen
)
228 return wxCONV_FAILED
;
230 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
231 return wxCONV_FAILED
;
238 // we convert just one chunk in this case as this is the entire
243 // advance the input pointer past the end of this chunk
244 while ( NotAllNULs(src
, nulLen
) )
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
253 src
+= nulLen
; // skipping over its terminator as well
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by srcEnd
266 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
267 const wchar_t *src
, size_t srcLen
) const
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten
= 0;
272 // make a copy of the input string unless it is already properly
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp
;
278 if ( srcLen
== wxNO_LEN
)
280 srcLen
= wxWcslen(src
) + 1;
282 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp
= wxWCharBuffer(srcLen
);
286 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
290 const size_t lenNul
= GetMBNulLen();
291 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
293 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
295 // try to convert the current chunk
296 size_t lenChunk
= WC2MB(NULL
, src
, 0);
298 if ( lenChunk
== wxCONV_FAILED
)
299 return wxCONV_FAILED
;
302 dstWritten
+= lenChunk
;
306 if ( dstWritten
> dstLen
)
307 return wxCONV_FAILED
;
309 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
310 return wxCONV_FAILED
;
319 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
321 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
322 if ( rc
!= wxCONV_FAILED
)
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
332 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
334 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
335 if ( rc
!= wxCONV_FAILED
)
343 wxMBConv::~wxMBConv()
345 // nothing to do here (necessary for Darwin linking probably)
348 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
352 // calculate the length of the buffer needed first
353 const size_t nLen
= ToWChar(NULL
, 0, psz
);
354 if ( nLen
!= wxCONV_FAILED
)
356 // now do the actual conversion
357 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
359 // +1 for the trailing NUL
360 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
365 return wxWCharBuffer();
368 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
372 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
373 if ( nLen
!= wxCONV_FAILED
)
375 wxCharBuffer
buf(nLen
- 1);
376 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
381 return wxCharBuffer();
385 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
387 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
388 if ( dstLen
!= wxCONV_FAILED
)
390 // notice that we allocate space for dstLen+1 wide characters here
391 // because we want the buffer to always be NUL-terminated, even if the
392 // input isn't (as otherwise the caller has no way to know its length)
393 wxWCharBuffer
wbuf(dstLen
);
394 wbuf
.data()[dstLen
- 1] = L
'\0';
395 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
400 if ( wbuf
[dstLen
- 1] == L
'\0' )
411 return wxWCharBuffer();
415 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
417 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
418 if ( dstLen
!= wxCONV_FAILED
)
420 const size_t nulLen
= GetMBNulLen();
422 // as above, ensure that the buffer is always NUL-terminated, even if
424 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
425 memset(buf
.data() + dstLen
, 0, nulLen
);
426 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
432 if ( dstLen
>= nulLen
&&
433 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
435 // in this case the output is NUL-terminated and we're not
436 // supposed to count NUL
448 return wxCharBuffer();
451 // ----------------------------------------------------------------------------
453 // ----------------------------------------------------------------------------
455 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
457 return wxMB2WC(buf
, psz
, n
);
460 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
462 return wxWC2MB(buf
, psz
, n
);
465 // ----------------------------------------------------------------------------
466 // wxConvBrokenFileNames
467 // ----------------------------------------------------------------------------
471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
473 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
474 wxStricmp(charset
, _T("UTF8")) == 0 )
475 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
477 m_conv
= new wxCSConv(charset
);
482 // ----------------------------------------------------------------------------
484 // ----------------------------------------------------------------------------
486 // Implementation (C) 2004 Fredrik Roubert
489 // BASE64 decoding table
491 static const unsigned char utf7unb64
[] =
493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
499 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
500 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
502 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
503 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
504 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
505 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
506 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
507 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
508 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
527 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
531 while ( *psz
&& (!buf
|| (len
< n
)) )
533 unsigned char cc
= *psz
++;
541 else if (*psz
== '-')
549 else // start of BASE64 encoded string
553 for ( ok
= lsb
= false, d
= 0, l
= 0;
554 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
559 for (l
+= 6; l
>= 8; lsb
= !lsb
)
561 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
571 *buf
= (wchar_t)(c
<< 8);
580 // in valid UTF7 we should have valid characters after '+'
581 return wxCONV_FAILED
;
589 if ( buf
&& (len
< n
) )
596 // BASE64 encoding table
598 static const unsigned char utf7enb64
[] =
600 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
601 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
602 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
603 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
604 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
605 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
606 'w', 'x', 'y', 'z', '0', '1', '2', '3',
607 '4', '5', '6', '7', '8', '9', '+', '/'
611 // UTF-7 encoding table
613 // 0 - Set D (directly encoded characters)
614 // 1 - Set O (optional direct characters)
615 // 2 - whitespace characters (optional)
616 // 3 - special characters
618 static const unsigned char utf7encode
[128] =
620 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
621 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
622 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
624 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
625 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
626 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
627 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
630 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
634 while (*psz
&& ((!buf
) || (len
< n
)))
637 if (cc
< 0x80 && utf7encode
[cc
] < 1)
646 else if (((wxUint32
)cc
) > 0xffff)
648 // no surrogate pair generation (yet?)
649 return wxCONV_FAILED
;
660 // BASE64 encode string
661 unsigned int lsb
, d
, l
;
662 for (d
= 0, l
= 0; /*nothing*/; psz
++)
664 for (lsb
= 0; lsb
< 2; lsb
++)
667 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
669 for (l
+= 8; l
>= 6; )
673 *buf
++ = utf7enb64
[(d
>> l
) % 64];
679 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
686 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
698 if (buf
&& (len
< n
))
704 // ----------------------------------------------------------------------------
706 // ----------------------------------------------------------------------------
708 static const wxUint32 utf8_max
[]=
709 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
711 // boundaries of the private use area we use to (temporarily) remap
712 // characters that are invalid in a UTF-8 encoded string
713 const wxUint32 wxUnicodePUA
= 0x100000;
714 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
716 // this table gives the length of the UTF-8 encoding from its first character:
717 const unsigned char tableUtf8Lengths
[256] = {
718 // single-byte sequences (ASCII):
719 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
720 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
721 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
722 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
723 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
724 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
725 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
726 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
728 // these are invalid:
729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
731 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
732 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
735 // two-byte sequences:
736 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
737 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
739 // three-byte sequences:
740 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
742 // four-byte sequences:
743 4, 4, 4, 4, 4, // F0..F4
745 // these are invalid again (5- or 6-byte
746 // sequences and sequences for code points
747 // above U+10FFFF, as restricted by RFC 3629):
748 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
752 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
753 const char *src
, size_t srcLen
) const
755 wchar_t *out
= dstLen
? dst
: NULL
;
758 if ( srcLen
== wxNO_LEN
)
759 srcLen
= strlen(src
) + 1;
761 for ( const char *p
= src
; ; p
++ )
763 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
765 // all done successfully, just add the trailing NUL if we are not
766 // using explicit length
767 if ( srcLen
== wxNO_LEN
)
783 if ( out
&& !dstLen
-- )
787 unsigned char c
= *p
;
791 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
794 if ( srcLen
!= wxNO_LEN
)
801 unsigned len
= tableUtf8Lengths
[c
];
805 if ( srcLen
< len
) // the test works for wxNO_LEN too
808 if ( srcLen
!= wxNO_LEN
)
811 // Char. number range | UTF-8 octet sequence
812 // (hexadecimal) | (binary)
813 // ----------------------+----------------------------------------
814 // 0000 0000 - 0000 007F | 0xxxxxxx
815 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
816 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
817 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
819 // Code point value is stored in bits marked with 'x',
820 // lowest-order bit of the value on the right side in the diagram
821 // above. (from RFC 3629)
823 // mask to extract lead byte's value ('x' bits above), by sequence
825 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
827 // mask and value of lead byte's most significant bits, by length:
828 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
829 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
831 len
--; // it's more convenient to work with 0-based length here
833 // extract the lead byte's value bits:
834 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
837 code
= c
& leadValueMask
[len
];
839 // all remaining bytes, if any, are handled in the same way
840 // regardless of sequence's length:
844 if ( (c
& 0xC0) != 0x80 )
845 return wxCONV_FAILED
;
853 // cast is ok because wchar_t == wxUint16 if WC_UTF16
854 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
863 #endif // WC_UTF16/!WC_UTF16
871 return wxCONV_FAILED
;
875 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
876 const wchar_t *src
, size_t srcLen
) const
878 char *out
= dstLen
? dst
: NULL
;
881 for ( const wchar_t *wp
= src
; ; wp
++ )
883 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
--) )
885 // all done successfully, just add the trailing NUL if we are not
886 // using explicit length
887 if ( srcLen
== wxNO_LEN
)
906 // cast is ok for WC_UTF16
907 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
909 // skip the next char too as we decoded a surrogate
912 #else // wchar_t is UTF-32
913 code
= *wp
& 0x7fffffff;
928 else if ( code
<= 0x07FF )
936 // NB: this line takes 6 least significant bits, encodes them as
937 // 10xxxxxx and discards them so that the next byte can be encoded:
938 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
939 out
[0] = 0xC0 | code
;
// U+FFFF is the last code point of the three-byte range (0800-FFFF in the
// RFC 3629 table quoted in ToWChar), so the bound has to be inclusive: with
// "<" it fell through to the four-byte branch below and was written as the
// overlong sequence F0 8F BF BF instead of EF BF BF
942 else if ( code
<= 0xFFFF )
950 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
951 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
952 out
[0] = 0xE0 | code
;
955 else if ( code
<= 0x10FFFF )
963 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
964 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
965 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
966 out
[0] = 0xF0 | code
;
971 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
984 // we only get here if an error occurs during decoding
985 return wxCONV_FAILED
;
988 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
989 const char *psz
, size_t srcLen
) const
991 if ( m_options
== MAP_INVALID_UTF8_NOT
)
992 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
996 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
998 const char *opsz
= psz
;
999 bool invalid
= false;
1000 unsigned char cc
= *psz
++, fc
= cc
;
1002 for (cnt
= 0; fc
& 0x80; cnt
++)
1012 // escape the escape character for octal escapes
1013 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1014 && cc
== '\\' && (!buf
|| len
< n
))
1026 // invalid UTF-8 sequence
1031 unsigned ocnt
= cnt
- 1;
1032 wxUint32 res
= cc
& (0x3f >> cnt
);
1036 if ((cc
& 0xC0) != 0x80)
1038 // invalid UTF-8 sequence
1044 res
= (res
<< 6) | (cc
& 0x3f);
1047 if (invalid
|| res
<= utf8_max
[ocnt
])
1049 // illegal UTF-8 encoding
1052 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1053 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1055 // if one of our PUA characters turns up externally
1056 // it must also be treated as an illegal sequence
1057 // (a bit like you have to escape an escape character)
1063 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1064 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1065 if (pa
== wxCONV_FAILED
)
1077 *buf
++ = (wchar_t)res
;
1079 #endif // WC_UTF16/!WC_UTF16
1085 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1087 while (opsz
< psz
&& (!buf
|| len
< n
))
1090 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1091 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1092 wxASSERT(pa
!= wxCONV_FAILED
);
1099 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1105 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1107 while (opsz
< psz
&& (!buf
|| len
< n
))
1109 if ( buf
&& len
+ 3 < n
)
1111 unsigned char on
= *opsz
;
1113 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1114 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1115 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1122 else // MAP_INVALID_UTF8_NOT
1124 return wxCONV_FAILED
;
1130 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
// Returns true if wch is one of the octal digits L'0' through L'7'.
static inline bool isoctal(wchar_t wch)
{
    return wch >= L'0' && wch <= L'7';
}
1141 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1142 const wchar_t *psz
, size_t srcLen
) const
1144 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1145 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1149 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1154 // cast is ok for WC_UTF16
1155 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1156 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1158 cc
= (*psz
++) & 0x7fffffff;
1161 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1162 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1165 *buf
++ = (char)(cc
- wxUnicodePUA
);
1168 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1169 && cc
== L
'\\' && psz
[0] == L
'\\' )
1176 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1178 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1182 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1183 (psz
[1] - L
'0') * 010 +
1193 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1209 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1211 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1217 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1223 // ============================================================================
1225 // ============================================================================
1227 #ifdef WORDS_BIGENDIAN
1228 #define wxMBConvUTF16straight wxMBConvUTF16BE
1229 #define wxMBConvUTF16swap wxMBConvUTF16LE
1231 #define wxMBConvUTF16swap wxMBConvUTF16BE
1232 #define wxMBConvUTF16straight wxMBConvUTF16LE
1236 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1238 if ( srcLen
== wxNO_LEN
)
1240 // count the number of bytes in input, including the trailing NULs
1241 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1242 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1245 srcLen
*= BYTES_PER_CHAR
;
1247 else // we already have the length
1249 // we can only convert an entire number of UTF-16 characters
1250 if ( srcLen
% BYTES_PER_CHAR
)
1251 return wxCONV_FAILED
;
1257 // case when in-memory representation is UTF-16 too
1260 // ----------------------------------------------------------------------------
1261 // conversions without endianness change
1262 // ----------------------------------------------------------------------------
1265 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1266 const char *src
, size_t srcLen
) const
1268 // set up the scene for using memcpy() (which is presumably more efficient
1269 // than copying the bytes one by one)
1270 srcLen
= GetLength(src
, srcLen
);
1271 if ( srcLen
== wxNO_LEN
)
1272 return wxCONV_FAILED
;
1274 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1277 if ( dstLen
< inLen
)
1278 return wxCONV_FAILED
;
1280 memcpy(dst
, src
, srcLen
);
1287 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1288 const wchar_t *src
, size_t srcLen
) const
1290 if ( srcLen
== wxNO_LEN
)
1291 srcLen
= wxWcslen(src
) + 1;
1293 srcLen
*= BYTES_PER_CHAR
;
1297 if ( dstLen
< srcLen
)
1298 return wxCONV_FAILED
;
1300 memcpy(dst
, src
, srcLen
);
1306 // ----------------------------------------------------------------------------
1307 // endian-reversing conversions
1308 // ----------------------------------------------------------------------------
1311 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1312 const char *src
, size_t srcLen
) const
1314 srcLen
= GetLength(src
, srcLen
);
1315 if ( srcLen
== wxNO_LEN
)
1316 return wxCONV_FAILED
;
1318 srcLen
/= BYTES_PER_CHAR
;
1322 if ( dstLen
< srcLen
)
1323 return wxCONV_FAILED
;
1325 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1326 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1328 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1336 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1337 const wchar_t *src
, size_t srcLen
) const
1339 if ( srcLen
== wxNO_LEN
)
1340 srcLen
= wxWcslen(src
) + 1;
1342 srcLen
*= BYTES_PER_CHAR
;
1346 if ( dstLen
< srcLen
)
1347 return wxCONV_FAILED
;
1349 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1350 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1352 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1359 #else // !WC_UTF16: wchar_t is UTF-32
1361 // ----------------------------------------------------------------------------
1362 // conversions without endianness change
1363 // ----------------------------------------------------------------------------
1366 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1367 const char *src
, size_t srcLen
) const
1369 srcLen
= GetLength(src
, srcLen
);
1370 if ( srcLen
== wxNO_LEN
)
1371 return wxCONV_FAILED
;
1373 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1376 // optimization: return maximal space which could be needed for this
1377 // string even if the real size could be smaller if the buffer contains
1383 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1384 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1386 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1388 return wxCONV_FAILED
;
1390 if ( ++outLen
> dstLen
)
1391 return wxCONV_FAILED
;
1401 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1402 const wchar_t *src
, size_t srcLen
) const
1404 if ( srcLen
== wxNO_LEN
)
1405 srcLen
= wxWcslen(src
) + 1;
1408 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1409 for ( size_t n
= 0; n
< srcLen
; n
++ )
1412 const size_t numChars
= encode_utf16(*src
++, cc
);
1413 if ( numChars
== wxCONV_FAILED
)
1414 return wxCONV_FAILED
;
1416 outLen
+= numChars
* BYTES_PER_CHAR
;
1419 if ( outLen
> dstLen
)
1420 return wxCONV_FAILED
;
1423 if ( numChars
== 2 )
1425 // second character of a surrogate
1434 // ----------------------------------------------------------------------------
1435 // endian-reversing conversions
1436 // ----------------------------------------------------------------------------
1439 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1440 const char *src
, size_t srcLen
) const
1442 srcLen
= GetLength(src
, srcLen
);
1443 if ( srcLen
== wxNO_LEN
)
1444 return wxCONV_FAILED
;
1446 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1449 // optimization: return maximal space which could be needed for this
1450 // string even if the real size could be smaller if the buffer contains
1456 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1457 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1462 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1464 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1466 const size_t numChars
= decode_utf16(tmp
, ch
);
1467 if ( numChars
== wxCONV_FAILED
)
1468 return wxCONV_FAILED
;
1470 if ( numChars
== 2 )
1473 if ( ++outLen
> dstLen
)
1474 return wxCONV_FAILED
;
1484 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1485 const wchar_t *src
, size_t srcLen
) const
1487 if ( srcLen
== wxNO_LEN
)
1488 srcLen
= wxWcslen(src
) + 1;
1491 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1492 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1495 const size_t numChars
= encode_utf16(*src
, cc
);
1496 if ( numChars
== wxCONV_FAILED
)
1497 return wxCONV_FAILED
;
1499 outLen
+= numChars
* BYTES_PER_CHAR
;
1502 if ( outLen
> dstLen
)
1503 return wxCONV_FAILED
;
1505 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1506 if ( numChars
== 2 )
1508 // second character of a surrogate
1509 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1517 #endif // WC_UTF16/!WC_UTF16
1520 // ============================================================================
1522 // ============================================================================
1524 #ifdef WORDS_BIGENDIAN
1525 #define wxMBConvUTF32straight wxMBConvUTF32BE
1526 #define wxMBConvUTF32swap wxMBConvUTF32LE
1528 #define wxMBConvUTF32swap wxMBConvUTF32BE
1529 #define wxMBConvUTF32straight wxMBConvUTF32LE
1533 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1534 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1537 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1539 if ( srcLen
== wxNO_LEN
)
1541 // count the number of bytes in input, including the trailing NULs
1542 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1543 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1546 srcLen
*= BYTES_PER_CHAR
;
1548 else // we already have the length
1550 // we can only convert an entire number of UTF-32 characters
1551 if ( srcLen
% BYTES_PER_CHAR
)
1552 return wxCONV_FAILED
;
1558 // case when in-memory representation is UTF-16
1561 // ----------------------------------------------------------------------------
1562 // conversions without endianness change
1563 // ----------------------------------------------------------------------------
1566 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1567 const char *src
, size_t srcLen
) const
1569 srcLen
= GetLength(src
, srcLen
);
1570 if ( srcLen
== wxNO_LEN
)
1571 return wxCONV_FAILED
;
1573 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1574 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1576 for ( size_t n
= 0; n
< inLen
; n
++ )
1579 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1580 if ( numChars
== wxCONV_FAILED
)
1581 return wxCONV_FAILED
;
1586 if ( outLen
> dstLen
)
1587 return wxCONV_FAILED
;
1590 if ( numChars
== 2 )
1592 // second character of a surrogate
1602 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1603 const wchar_t *src
, size_t srcLen
) const
1605 if ( srcLen
== wxNO_LEN
)
1606 srcLen
= wxWcslen(src
) + 1;
1610 // optimization: return maximal space which could be needed for this
1611 // string instead of the exact amount which could be less if there are
1612 // any surrogates in the input
1614 // we consider that surrogates are rare enough to make it worthwhile to
1615 // avoid running the loop below at the cost of slightly extra memory
1617 return srcLen
* BYTES_PER_CHAR
;
1620 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1622 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1624 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1626 return wxCONV_FAILED
;
1628 outLen
+= BYTES_PER_CHAR
;
1630 if ( outLen
> dstLen
)
1631 return wxCONV_FAILED
;
1639 // ----------------------------------------------------------------------------
1640 // endian-reversing conversions
1641 // ----------------------------------------------------------------------------
1644 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1645 const char *src
, size_t srcLen
) const
1647 srcLen
= GetLength(src
, srcLen
);
1648 if ( srcLen
== wxNO_LEN
)
1649 return wxCONV_FAILED
;
1651 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1652 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1654 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1657 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1658 if ( numChars
== wxCONV_FAILED
)
1659 return wxCONV_FAILED
;
1664 if ( outLen
> dstLen
)
1665 return wxCONV_FAILED
;
1668 if ( numChars
== 2 )
1670 // second character of a surrogate
1680 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1681 const wchar_t *src
, size_t srcLen
) const
1683 if ( srcLen
== wxNO_LEN
)
1684 srcLen
= wxWcslen(src
) + 1;
1688 // optimization: return maximal space which could be needed for this
1689 // string instead of the exact amount which could be less if there are
1690 // any surrogates in the input
1692 // we consider that surrogates are rare enough to make it worthwhile to
1693 // avoid running the loop below at the cost of slightly extra memory
1695 return srcLen
*BYTES_PER_CHAR
;
1698 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1700 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1702 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1704 return wxCONV_FAILED
;
1706 outLen
+= BYTES_PER_CHAR
;
1708 if ( outLen
> dstLen
)
1709 return wxCONV_FAILED
;
1711 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1717 #else // !WC_UTF16: wchar_t is UTF-32
1719 // ----------------------------------------------------------------------------
1720 // conversions without endianness change
1721 // ----------------------------------------------------------------------------
1724 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1725 const char *src
, size_t srcLen
) const
1727 // use memcpy() as it should be much faster than hand-written loop
1728 srcLen
= GetLength(src
, srcLen
);
1729 if ( srcLen
== wxNO_LEN
)
1730 return wxCONV_FAILED
;
1732 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1735 if ( dstLen
< inLen
)
1736 return wxCONV_FAILED
;
1738 memcpy(dst
, src
, srcLen
);
1745 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1746 const wchar_t *src
, size_t srcLen
) const
1748 if ( srcLen
== wxNO_LEN
)
1749 srcLen
= wxWcslen(src
) + 1;
1751 srcLen
*= BYTES_PER_CHAR
;
1755 if ( dstLen
< srcLen
)
1756 return wxCONV_FAILED
;
1758 memcpy(dst
, src
, srcLen
);
1764 // ----------------------------------------------------------------------------
1765 // endian-reversing conversions
1766 // ----------------------------------------------------------------------------
1769 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1770 const char *src
, size_t srcLen
) const
1772 srcLen
= GetLength(src
, srcLen
);
1773 if ( srcLen
== wxNO_LEN
)
1774 return wxCONV_FAILED
;
1776 srcLen
/= BYTES_PER_CHAR
;
1780 if ( dstLen
< srcLen
)
1781 return wxCONV_FAILED
;
1783 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1784 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1786 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1794 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1795 const wchar_t *src
, size_t srcLen
) const
1797 if ( srcLen
== wxNO_LEN
)
1798 srcLen
= wxWcslen(src
) + 1;
1800 srcLen
*= BYTES_PER_CHAR
;
1804 if ( dstLen
< srcLen
)
1805 return wxCONV_FAILED
;
1807 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1808 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1810 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1817 #endif // WC_UTF16/!WC_UTF16
1820 // ============================================================================
1821 // The classes doing conversion using the iconv_xxx() functions
1822 // ============================================================================
1826 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1827 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1828 // (unless there's yet another bug in glibc) the only case when iconv()
1829 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1830 // left in the input buffer -- when _real_ error occurs,
1831 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1833 // [This bug does not appear in glibc 2.2.]
1834 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1835 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1836 (errno != E2BIG || bufLeft != 0))
1838 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1841 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1843 #define ICONV_T_INVALID ((iconv_t)-1)
1845 #if SIZEOF_WCHAR_T == 4
1846 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1847 #define WC_ENC wxFONTENCODING_UTF32
1848 #elif SIZEOF_WCHAR_T == 2
1849 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1850 #define WC_ENC wxFONTENCODING_UTF16
1851 #else // sizeof(wchar_t) != 2 nor 4
1852 // does this ever happen?
1853 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1856 // ----------------------------------------------------------------------------
1857 // wxMBConv_iconv: encapsulates an iconv character set
1858 // ----------------------------------------------------------------------------
1860 class wxMBConv_iconv
: public wxMBConv
1863 wxMBConv_iconv(const char *name
);
1864 virtual ~wxMBConv_iconv();
1866 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const;
1867 virtual size_t WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const;
1869 // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1870 virtual size_t GetMBNulLen() const;
1872 #if wxUSE_UNICODE_UTF8
1873 virtual bool IsUTF8() const;
1876 virtual wxMBConv
*Clone() const
1878 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
1879 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1884 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1887 // the iconv handlers used to translate from multibyte
1888 // to wide char and in the other direction
1893 // guards access to m2w and w2m objects
1894 wxMutex m_iconvMutex
;
1898 // the name (for iconv_open()) of a wide char charset -- if none is
1899 // available on this machine, it will remain NULL
1900 static wxString ms_wcCharsetName
;
1902 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1903 // different endian-ness than the native one
1904 static bool ms_wcNeedsSwap
;
1907 // name of the encoding handled by this conversion
1910 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1912 size_t m_minMBCharWidth
;
1915 // make the constructor available for unit testing
1916 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
1918 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1919 if ( !result
->IsOk() )
1928 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1929 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1931 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
1934 m_minMBCharWidth
= 0;
1936 // check for charset that represents wchar_t:
1937 if ( ms_wcCharsetName
.empty() )
1939 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1942 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1943 #else // !wxUSE_FONTMAP
1944 static const wxChar
*names_static
[] =
1946 #if SIZEOF_WCHAR_T == 4
1948 #elif SIZEOF_WCHAR_T = 2
1953 const wxChar
**names
= names_static
;
1954 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1956 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1958 const wxString
nameCS(*names
);
1960 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1961 wxString
nameXE(nameCS
);
1963 #ifdef WORDS_BIGENDIAN
1965 #else // little endian
1969 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1972 m2w
= iconv_open(nameXE
.ToAscii(), name
);
1973 if ( m2w
== ICONV_T_INVALID
)
1975 // try charset w/o bytesex info (e.g. "UCS4")
1976 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1978 m2w
= iconv_open(nameCS
.ToAscii(), name
);
1980 // and check for bytesex ourselves:
1981 if ( m2w
!= ICONV_T_INVALID
)
1983 char buf
[2], *bufPtr
;
1992 outsz
= SIZEOF_WCHAR_T
* 2;
1993 char* wbufPtr
= (char*)wbuf
;
1997 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2000 if (ICONV_FAILED(res
, insz
))
2002 wxLogLastError(wxT("iconv"));
2003 wxLogError(_("Conversion to charset '%s' doesn't work."),
2006 else // ok, can convert to this encoding, remember it
2008 ms_wcCharsetName
= nameCS
;
2009 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2013 else // use charset not requiring byte swapping
2015 ms_wcCharsetName
= nameXE
;
2019 wxLogTrace(TRACE_STRCONV
,
2020 wxT("iconv wchar_t charset is \"%s\"%s"),
2021 ms_wcCharsetName
.empty() ? wxString("<none>")
2023 ms_wcNeedsSwap
? _T(" (needs swap)")
2026 else // we already have ms_wcCharsetName
2028 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2031 if ( ms_wcCharsetName
.empty() )
2033 w2m
= ICONV_T_INVALID
;
2037 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2038 if ( w2m
== ICONV_T_INVALID
)
2040 wxLogTrace(TRACE_STRCONV
,
2041 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2042 ms_wcCharsetName
.c_str(), name
);
2047 wxMBConv_iconv::~wxMBConv_iconv()
2049 if ( m2w
!= ICONV_T_INVALID
)
2051 if ( w2m
!= ICONV_T_INVALID
)
2055 size_t wxMBConv_iconv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2057 // find the string length: notice that this must be done differently for
2058 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2060 const size_t nulLen
= GetMBNulLen();
2064 return wxCONV_FAILED
;
2067 inbuf
= strlen(psz
); // arguably more optimized than our version
2072 // for UTF-16/32 not only do we need to have 2/4 consecutive NULs but
2073 // they also have to start at character boundary and not span two
2074 // adjacent characters
2076 for ( p
= psz
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2083 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2084 // Unfortunately there are a couple of global wxCSConv objects such as
2085 // wxConvLocal that are used all over wx code, so we have to make sure
2086 // the handle is used by at most one thread at a time. Otherwise
2087 // only a few wx classes would be safe to use from non-main threads
2088 // as MB<->WC conversion would fail "randomly".
2089 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2090 #endif // wxUSE_THREADS
2092 size_t outbuf
= n
* SIZEOF_WCHAR_T
;
2094 const char *pszPtr
= psz
;
2098 char* bufPtr
= (char*)buf
;
2100 // have destination buffer, convert there
2102 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
2104 res
= n
- (outbuf
/ SIZEOF_WCHAR_T
);
2108 // convert to native endianness
2109 for ( unsigned i
= 0; i
< res
; i
++ )
2110 buf
[n
] = WC_BSWAP(buf
[i
]);
2113 // NUL-terminate the string if there is any space left
2119 // no destination buffer... convert using temp buffer
2120 // to calculate destination buffer requirement
2126 char* bufPtr
= (char*)tbuf
;
2127 outbuf
= 8 * SIZEOF_WCHAR_T
;
2130 ICONV_CHAR_CAST(&pszPtr
), &inbuf
,
2133 res
+= 8 - (outbuf
/ SIZEOF_WCHAR_T
);
2135 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2138 if (ICONV_FAILED(cres
, inbuf
))
2140 //VS: it is ok if iconv fails, hence trace only
2141 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2142 return wxCONV_FAILED
;
2148 size_t wxMBConv_iconv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
2151 // NB: explained in MB2WC
2152 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2155 size_t inlen
= wxWcslen(psz
);
2156 size_t inbuflen
= inlen
* SIZEOF_WCHAR_T
;
2157 size_t outbuflen
= n
;
2160 wchar_t *tmpbuf
= 0;
2164 // need to copy to temp buffer to switch endianness
2165 // (doing WC_BSWAP twice on the original buffer won't help, as it
2166 // could be in read-only memory, or be accessed in some other thread)
2167 tmpbuf
= (wchar_t *)malloc(inbuflen
+ SIZEOF_WCHAR_T
);
2168 for ( size_t i
= 0; i
< inlen
; i
++ )
2169 tmpbuf
[n
] = WC_BSWAP(psz
[i
]);
2171 tmpbuf
[inlen
] = L
'\0';
2175 char* inbuf
= (char*)psz
;
2178 // have destination buffer, convert there
2179 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &buf
, &outbuflen
);
2181 res
= n
- outbuflen
;
2183 // NB: iconv was given only wcslen(psz) characters on input, and so
2184 // it couldn't convert the trailing zero. Let's do it ourselves
2185 // if there's some room left for it in the output buffer.
2191 // no destination buffer: convert using temp buffer
2192 // to calculate destination buffer requirement
2200 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &buf
, &outbuflen
);
2202 res
+= 16 - outbuflen
;
2204 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2212 if (ICONV_FAILED(cres
, inbuflen
))
2214 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2215 return wxCONV_FAILED
;
2221 size_t wxMBConv_iconv::GetMBNulLen() const
2223 if ( m_minMBCharWidth
== 0 )
2225 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2228 // NB: explained in MB2WC
2229 wxMutexLocker
lock(self
->m_iconvMutex
);
2232 const wchar_t *wnul
= L
"";
2233 char buf
[8]; // should be enough for NUL in any encoding
2234 size_t inLen
= sizeof(wchar_t),
2235 outLen
= WXSIZEOF(buf
);
2236 char *inBuff
= (char *)wnul
;
2237 char *outBuff
= buf
;
2238 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2240 self
->m_minMBCharWidth
= (size_t)-1;
2244 self
->m_minMBCharWidth
= outBuff
- buf
;
2248 return m_minMBCharWidth
;
2251 #if wxUSE_UNICODE_UTF8
2252 bool wxMBConv_iconv::IsUTF8() const
2254 return wxStricmp(m_name
, "UTF-8") == 0 ||
2255 wxStricmp(m_name
, "UTF8") == 0;
2259 #endif // HAVE_ICONV
2262 // ============================================================================
2263 // Win32 conversion classes
2264 // ============================================================================
2266 #ifdef wxHAVE_WIN32_MB2WC
2270 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2271 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2274 class wxMBConv_win32
: public wxMBConv
2279 m_CodePage
= CP_ACP
;
2280 m_minMBCharWidth
= 0;
2283 wxMBConv_win32(const wxMBConv_win32
& conv
)
2286 m_CodePage
= conv
.m_CodePage
;
2287 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2291 wxMBConv_win32(const char* name
)
2293 m_CodePage
= wxCharsetToCodepage(name
);
2294 m_minMBCharWidth
= 0;
2297 wxMBConv_win32(wxFontEncoding encoding
)
2299 m_CodePage
= wxEncodingToCodepage(encoding
);
2300 m_minMBCharWidth
= 0;
2302 #endif // wxUSE_FONTMAP
2304 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2306 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2307 // the behaviour is not compatible with the Unix version (using iconv)
2308 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2309 // wouldn't work if reading an incomplete MB char didn't result in an
2312 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2313 // Win XP or newer and it is not supported for UTF-[78] so we always
2314 // use our own conversions in this case. See
2315 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2316 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2317 if ( m_CodePage
== CP_UTF8
)
2319 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2322 if ( m_CodePage
== CP_UTF7
)
2324 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2328 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2329 IsAtLeastWin2kSP4() )
2331 flags
= MB_ERR_INVALID_CHARS
;
2334 const size_t len
= ::MultiByteToWideChar
2336 m_CodePage
, // code page
2337 flags
, // flags: fall on error
2338 psz
, // input string
2339 -1, // its length (NUL-terminated)
2340 buf
, // output string
2341 buf
? n
: 0 // size of output buffer
2345 // function totally failed
2346 return wxCONV_FAILED
;
2349 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2350 // check if we succeeded, by doing a double trip:
2351 if ( !flags
&& buf
)
2353 const size_t mbLen
= strlen(psz
);
2354 wxCharBuffer
mbBuf(mbLen
);
2355 if ( ::WideCharToMultiByte
2362 mbLen
+ 1, // size in bytes, not length
2366 strcmp(mbBuf
, psz
) != 0 )
2368 // we didn't obtain the same thing we started from, hence
2369 // the conversion was lossy and we consider that it failed
2370 return wxCONV_FAILED
;
2374 // note that it returns count of written chars for buf != NULL and size
2375 // of the needed buffer for buf == NULL so in either case the length of
2376 // the string (which never includes the terminating NUL) is one less
2380 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2383 we have a problem here: by default, WideCharToMultiByte() may
2384 replace characters unrepresentable in the target code page with bad
2385 quality approximations such as turning "1/2" symbol (U+00BD) into
2386 "1" for the code pages which don't have it and we, obviously, want
2387 to avoid this at any price
2389 the trouble is that this function does it _silently_, i.e. it won't
2390 even tell us whether it did or not... Win98/2000 and higher provide
2391 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2392 we have to resort to a round trip, i.e. check that converting back
2393 results in the same string -- this is, of course, expensive but
2394 otherwise we simply can't be sure to not garble the data.
2397 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2398 // it doesn't work with CJK encodings (which we test for rather roughly
2399 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2401 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2404 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2406 // it's our lucky day
2407 flags
= WC_NO_BEST_FIT_CHARS
;
2408 pUsedDef
= &usedDef
;
2410 else // old system or unsupported encoding
2416 const size_t len
= ::WideCharToMultiByte
2418 m_CodePage
, // code page
2419 flags
, // either none or no best fit
2420 pwz
, // input string
2421 -1, // it is (wide) NUL-terminated
2422 buf
, // output buffer
2423 buf
? n
: 0, // and its size
2424 NULL
, // default "replacement" char
2425 pUsedDef
// [out] was it used?
2430 // function totally failed
2431 return wxCONV_FAILED
;
2434 // we did something, check if we really succeeded
2437 // check if the conversion failed, i.e. if any replacements
2440 return wxCONV_FAILED
;
2442 else // we must resort to double tripping...
2444 // first we need to ensure that we really have the MB data: this is
2445 // not the case if we're called with NULL buffer, in which case we
2446 // need to do the conversion yet again
2447 wxCharBuffer bufDef
;
2450 bufDef
= wxCharBuffer(len
);
2451 buf
= bufDef
.data();
2452 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2453 buf
, len
, NULL
, NULL
) )
2454 return wxCONV_FAILED
;
2459 wxWCharBuffer
wcBuf(n
);
2460 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2461 wcscmp(wcBuf
, pwz
) != 0 )
2463 // we didn't obtain the same thing we started from, hence
2464 // the conversion was lossy and we consider that it failed
2465 return wxCONV_FAILED
;
2469 // see the comment above for the reason of "len - 1"
2473 virtual size_t GetMBNulLen() const
2475 if ( m_minMBCharWidth
== 0 )
2477 int len
= ::WideCharToMultiByte
2479 m_CodePage
, // code page
2481 L
"", // input string
2482 1, // translate just the NUL
2483 NULL
, // output buffer
2485 NULL
, // no replacement char
2486 NULL
// [out] don't care if it was used
2489 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2493 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2494 self
->m_minMBCharWidth
= (size_t)-1;
2498 self
->m_minMBCharWidth
= (size_t)-1;
2504 self
->m_minMBCharWidth
= len
;
2509 return m_minMBCharWidth
;
2512 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2514 bool IsOk() const { return m_CodePage
!= -1; }
2517 static bool CanUseNoBestFit()
2519 static int s_isWin98Or2k
= -1;
2521 if ( s_isWin98Or2k
== -1 )
2524 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2526 case wxOS_WINDOWS_9X
:
2527 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2530 case wxOS_WINDOWS_NT
:
2531 s_isWin98Or2k
= verMaj
>= 5;
2535 // unknown: be conservative by default
2540 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2543 return s_isWin98Or2k
== 1;
2546 static bool IsAtLeastWin2kSP4()
2551 static int s_isAtLeastWin2kSP4
= -1;
2553 if ( s_isAtLeastWin2kSP4
== -1 )
2555 OSVERSIONINFOEX ver
;
2557 memset(&ver
, 0, sizeof(ver
));
2558 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2559 GetVersionEx((OSVERSIONINFO
*)&ver
);
2561 s_isAtLeastWin2kSP4
=
2562 ((ver
.dwMajorVersion
> 5) || // Vista+
2563 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2564 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2565 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2569 return s_isAtLeastWin2kSP4
== 1;
2574 // the code page we're working with
2577 // cached result of GetMBNulLen(), set to 0 initially meaning
2579 size_t m_minMBCharWidth
;
2582 #endif // wxHAVE_WIN32_MB2WC
2585 // ============================================================================
2586 // wxEncodingConverter based conversion classes
2587 // ============================================================================
2591 class wxMBConv_wxwin
: public wxMBConv
2596 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2597 // The wxMBConv_cf class does a better job.
2598 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2599 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2600 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2604 // temporarily just use wxEncodingConverter stuff,
2605 // so that it works while a better implementation is built
2606 wxMBConv_wxwin(const char* name
)
2609 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2611 m_enc
= wxFONTENCODING_SYSTEM
;
2616 wxMBConv_wxwin(wxFontEncoding enc
)
2623 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2625 size_t inbuf
= strlen(psz
);
2628 if (!m2w
.Convert(psz
, buf
))
2629 return wxCONV_FAILED
;
2634 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2636 const size_t inbuf
= wxWcslen(psz
);
2639 if (!w2m
.Convert(psz
, buf
))
2640 return wxCONV_FAILED
;
2646 virtual size_t GetMBNulLen() const
2650 case wxFONTENCODING_UTF16BE
:
2651 case wxFONTENCODING_UTF16LE
:
2654 case wxFONTENCODING_UTF32BE
:
2655 case wxFONTENCODING_UTF32LE
:
2663 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2665 bool IsOk() const { return m_ok
; }
2668 wxFontEncoding m_enc
;
2669 wxEncodingConverter m2w
, w2m
;
2672 // were we initialized successfully?
2675 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2678 // make the constructors available for unit testing
2679 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2681 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2682 if ( !result
->IsOk() )
2691 #endif // wxUSE_FONTMAP
2693 // ============================================================================
2694 // wxCSConv implementation
2695 // ============================================================================
2697 void wxCSConv::Init()
2704 wxCSConv::wxCSConv(const wxString
& charset
)
2708 if ( !charset
.empty() )
2710 SetName(charset
.ToAscii());
2714 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2716 m_encoding
= wxFONTENCODING_SYSTEM
;
2720 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2722 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2724 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2726 encoding
= wxFONTENCODING_SYSTEM
;
2731 m_encoding
= encoding
;
2734 wxCSConv::~wxCSConv()
2739 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2744 SetName(conv
.m_name
);
2745 m_encoding
= conv
.m_encoding
;
2748 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2752 SetName(conv
.m_name
);
2753 m_encoding
= conv
.m_encoding
;
2758 void wxCSConv::Clear()
2767 void wxCSConv::SetName(const char *charset
)
2771 m_name
= wxStrdup(charset
);
2778 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2779 wxEncodingNameCache
);
2781 static wxEncodingNameCache gs_nameCache
;
2784 wxMBConv
*wxCSConv::DoCreate() const
2787 wxLogTrace(TRACE_STRCONV
,
2788 wxT("creating conversion for %s"),
2790 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2791 #endif // wxUSE_FONTMAP
2793 // check for the special case of ASCII or ISO8859-1 charset: as we have
2794 // special knowledge of it anyhow, we don't need to create a special
2795 // conversion object
2796 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2797 m_encoding
== wxFONTENCODING_DEFAULT
)
2799 // don't convert at all
2803 // we trust OS to do conversion better than we can so try external
2804 // conversion methods first
2806 // the full order is:
2807 // 1. OS conversion (iconv() under Unix or Win32 API)
2808 // 2. hard coded conversions for UTF
2809 // 3. wxEncodingConverter as fall back
2815 #endif // !wxUSE_FONTMAP
2818 wxFontEncoding
encoding(m_encoding
);
2823 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2831 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2832 #endif // wxUSE_FONTMAP
2836 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2837 if ( it
!= gs_nameCache
.end() )
2839 if ( it
->second
.empty() )
2842 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
2849 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2850 // CS : in case this does not return valid names (eg for MacRoman)
2851 // encoding got a 'failure' entry in the cache all the same,
2852 // although it just has to be created using a different method, so
2853 // only store failed iconv creation attempts (or perhaps we
2854 // shouldn't do this at all ?)
2855 if ( names
[0] != NULL
)
2857 for ( ; *names
; ++names
)
2859 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2860 // will need changes that will obsolete this
2861 wxString
name(*names
);
2862 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
2865 gs_nameCache
[encoding
] = *names
;
2872 gs_nameCache
[encoding
] = _T(""); // cache the failure
2875 #endif // wxUSE_FONTMAP
2877 #endif // HAVE_ICONV
2879 #ifdef wxHAVE_WIN32_MB2WC
2882 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2883 : new wxMBConv_win32(m_encoding
);
2892 #endif // wxHAVE_WIN32_MB2WC
2896 // leave UTF16 and UTF32 to the built-ins of wx
2897 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2898 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2901 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
2902 : new wxMBConv_cf(m_encoding
);
2904 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
2913 #endif // __DARWIN__
2916 wxFontEncoding enc
= m_encoding
;
2918 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2920 // use "false" to suppress interactive dialogs -- we can be called from
2921 // anywhere and popping up a dialog from here is the last thing we want to
2923 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2925 #endif // wxUSE_FONTMAP
2929 case wxFONTENCODING_UTF7
:
2930 return new wxMBConvUTF7
;
2932 case wxFONTENCODING_UTF8
:
2933 return new wxMBConvUTF8
;
2935 case wxFONTENCODING_UTF16BE
:
2936 return new wxMBConvUTF16BE
;
2938 case wxFONTENCODING_UTF16LE
:
2939 return new wxMBConvUTF16LE
;
2941 case wxFONTENCODING_UTF32BE
:
2942 return new wxMBConvUTF32BE
;
2944 case wxFONTENCODING_UTF32LE
:
2945 return new wxMBConvUTF32LE
;
2948 // nothing to do but put here to suppress gcc warnings
2955 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2956 : new wxMBConv_wxwin(m_encoding
);
2962 #endif // wxUSE_FONTMAP
2964 // NB: This is a hack to prevent deadlock. What could otherwise happen
2965 // in Unicode build: wxConvLocal creation ends up being here
2966 // because of some failure and logs the error. But wxLog will try to
2967 // attach a timestamp, for which it will need wxConvLocal (to convert
2968 // time to char* and then wchar_t*), but that fails, tries to log the
2969 // error, but wxLog has an (already locked) critical section that
2970 // guards the static buffer.
2971 static bool alreadyLoggingError
= false;
2972 if (!alreadyLoggingError
)
2974 alreadyLoggingError
= true;
2975 wxLogError(_("Cannot convert from the charset '%s'!"),
2979 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
2980 #else // !wxUSE_FONTMAP
2981 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
2982 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2985 alreadyLoggingError
= false;
2991 void wxCSConv::CreateConvIfNeeded() const
2995 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
2997 // if we have neither the name nor the encoding, use the default
2998 // encoding for this system
2999 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3002 self
->m_encoding
= wxLocale::GetSystemEncoding();
3004 // fallback to some reasonable default:
3005 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3006 #endif // wxUSE_INTL
3009 self
->m_convReal
= DoCreate();
3010 self
->m_deferred
= false;
3014 bool wxCSConv::IsOk() const
3016 CreateConvIfNeeded();
3018 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3019 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3020 return true; // always ok as we do it ourselves
3022 // m_convReal->IsOk() is called at its own creation, so we know it must
3023 // be ok if m_convReal is non-NULL
3024 return m_convReal
!= NULL
;
3027 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3028 const char *src
, size_t srcLen
) const
3030 CreateConvIfNeeded();
3033 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3036 return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
);
3039 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3040 const wchar_t *src
, size_t srcLen
) const
3042 CreateConvIfNeeded();
3045 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3048 return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
);
3051 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3053 CreateConvIfNeeded();
3056 return m_convReal
->MB2WC(buf
, psz
, n
);
3059 size_t len
= strlen(psz
);
3063 for (size_t c
= 0; c
<= len
; c
++)
3064 buf
[c
] = (unsigned char)(psz
[c
]);
3070 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3072 CreateConvIfNeeded();
3075 return m_convReal
->WC2MB(buf
, psz
, n
);
3078 const size_t len
= wxWcslen(psz
);
3081 for (size_t c
= 0; c
<= len
; c
++)
3084 return wxCONV_FAILED
;
3086 buf
[c
] = (char)psz
[c
];
3091 for (size_t c
= 0; c
<= len
; c
++)
3094 return wxCONV_FAILED
;
3101 size_t wxCSConv::GetMBNulLen() const
3103 CreateConvIfNeeded();
3107 return m_convReal
->GetMBNulLen();
3110 // otherwise, we are ISO-8859-1
3114 #if wxUSE_UNICODE_UTF8
3115 bool wxCSConv::IsUTF8() const
3117 CreateConvIfNeeded();
3121 return m_convReal
->IsUTF8();
3124 // otherwise, we are ISO-8859-1
// Convert the multibyte string s to a wide character buffer, trying several
// converters in turn: wxConvLibc first, then a default-constructed
// wxMBConvUTF8, then wxConvISO8859_1.
// NOTE(review): the conditions are not visible in this listing. Presumably an
// empty wxWCharBuffer is returned for a NULL s and each later converter is
// tried only if the previous result was empty -- confirm against the full
// source. The final return of wbuf is not visible either.
3132 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3135 return wxWCharBuffer();
3137 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3139 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3141 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Convert the wide character string ws to a multibyte buffer, trying
// wxConvLibc first and then a wxMBConvUTF8 constructed with
// wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL.
// NOTE(review): the conditions are not visible in this listing. Presumably an
// empty wxCharBuffer is returned for a NULL ws and the UTF-8 converter is
// used only if the libc conversion produced nothing -- confirm against the
// full source. The final return of buf is not visible either.
3146 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3149 return wxCharBuffer();
3151 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3153 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3158 #endif // wxUSE_UNICODE
3160 // ----------------------------------------------------------------------------
3162 // ----------------------------------------------------------------------------
3164 // NB: The reason why we create converter objects in this convoluted way,
3165 // using a factory function instead of global variable, is that they
3166 // may be used at static initialization time (some of them are used by
3167 // wxString ctors and there may be a global wxString object). In other
3168 // words, possibly _before_ the converter global object would be
3175 #undef wxConvISO8859_1
// Define one global converter called `name`:
//  - klass:      type of the pointers exposed to callers
//  - impl_klass: concrete type of the converter object actually created
//  - name:       base name used to build the generated identifiers
//  - ctor_args:  text pasted after the object name in its declaration
//                (constructor arguments)
// The expansion produces a name##Ptr pointer variable initialised to NULL, an
// accessor wxGet_##name##Ptr() returning the address of a function-local
// static impl_klass object, and a file-static gs_##name##instance whose
// initialiser calls the accessor during static initialization.
// NOTE(review): the lines with the accessor's braces are not visible in this
// listing.
3177 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3178 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3179 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3181 static impl_klass name##Obj ctor_args; \
3182 return &name##Obj; \
3184 /* this ensures that all global converter objects are created */ \
3185 /* by the time static initialization is done, i.e. before any */ \
3186 /* thread is launched: */ \
3187 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for WX_DEFINE_GLOBAL_CONV2 when the exposed pointer type and the
// concrete object type are the same class.
3189 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3190 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// wxConvLibc: exposed as wxMBConv and implemented either by wxMBConv_win32 or
// by wxMBConvLibc, both default-constructed (wxEMPTY_PARAMETER_VALUE).
// NOTE(review): the preprocessor conditional choosing between the two
// definitions is not visible in this listing -- presumably it tests
// wxHAVE_WIN32_MB2WC; confirm.
3193 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3195 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3198 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3199 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3200 // provokes an error message about "not enough macro parameters"; and we
3201 // can't use "()" here as the name##Obj declaration would be parsed as a
3202 // function declaration then, so use a semicolon and live with an extra
3203 // empty statement (and hope that no compiler warns about this)
// Default-constructed UTF-8 and UTF-7 converters. The ";" passed as ctor_args
// ends the object declaration right after its name.
3204 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3205 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
// wxCSConv-based converters: wxConvLocal is constructed with
// wxFONTENCODING_SYSTEM and wxConvISO8859_1 with wxFONTENCODING_ISO8859_1.
3207 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3208 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// Global converter pointers, initialised through the accessor functions
// generated by the macros above: wxConvCurrent starts out as the libc
// converter and wxConvUI as the wxConvLocal converter.
3210 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3211 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3214 // The xnu kernel always communicates file paths in decomposed UTF-8.
3215 // WARNING: Are we sure that CFString's conversion will cause decomposition?
// File-static wxMBConv_cf converter constructed for wxFONTENCODING_UTF8.
// NOTE(review): presumably compiled only under __DARWIN__ and used as the
// file name converter there; the enclosing #ifdef is not visible in this
// listing.
3216 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
// Global wxConvFileName converter pointer. In the !__DARWIN__ branch it is
// initialised to the libc converter.
// NOTE(review): the __DARWIN__ branch of the initialiser is not visible in
// this listing -- presumably it points at wxConvMacUTF8DObj; confirm.
3219 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3222 #else // !__DARWIN__
3223 wxGet_wxConvLibcPtr();
3224 #endif // __DARWIN__/!__DARWIN__
3226 #else // !wxUSE_WCHAR_T
3228 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3229 // stand-ins in absence of wchar_t
3230 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3235 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T