1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/mac/corefoundation/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV _T("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
// Helper for the conversion functions: reports whether the n bytes starting at
// p contain at least one byte that is not NUL, i.e. returns false only if the
// whole range consists of NUL bytes (an empty range counts as "all NULs").
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        // the first non-NUL byte settles the question
        if ( *p != '\0' )
            return true;
    }

    return false;
}
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten
= 0;
172 // the number of NULs terminating this string
173 size_t nulLen
= 0; // not really needed, but just to avoid warnings
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
181 if ( srcLen
!= wxNO_LEN
)
183 // we need to know how to find the end of this string
184 nulLen
= GetMBNulLen();
185 if ( nulLen
== wxCONV_FAILED
)
186 return wxCONV_FAILED
;
188 // if there are enough NULs we can avoid the copy
189 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
191 // make a copy in order to properly NUL-terminate the string
192 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
193 char * const p
= bufTmp
.data();
194 memcpy(p
, src
, srcLen
);
195 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
201 srcEnd
= src
+ srcLen
;
203 else // quit after the first loop iteration
210 // try to convert the current chunk
211 size_t lenChunk
= MB2WC(NULL
, src
, 0);
212 if ( lenChunk
== wxCONV_FAILED
)
213 return wxCONV_FAILED
;
215 lenChunk
++; // for the L'\0' at the end of this chunk
217 dstWritten
+= lenChunk
;
221 // nothing left in the input string, conversion succeeded
227 if ( dstWritten
> dstLen
)
228 return wxCONV_FAILED
;
230 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
231 return wxCONV_FAILED
;
238 // we convert just one chunk in this case as this is the entire
243 // advance the input pointer past the end of this chunk
244 while ( NotAllNULs(src
, nulLen
) )
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
253 src
+= nulLen
; // skipping over its terminator as well
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by srcEnd
266 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
267 const wchar_t *src
, size_t srcLen
) const
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten
= 0;
272 // make a copy of the input string unless it is already properly
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp
;
278 if ( srcLen
== wxNO_LEN
)
280 srcLen
= wxWcslen(src
) + 1;
282 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp
= wxWCharBuffer(srcLen
);
286 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
290 const size_t lenNul
= GetMBNulLen();
291 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
293 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
295 // try to convert the current chunk
296 size_t lenChunk
= WC2MB(NULL
, src
, 0);
298 if ( lenChunk
== wxCONV_FAILED
)
299 return wxCONV_FAILED
;
302 dstWritten
+= lenChunk
;
306 if ( dstWritten
> dstLen
)
307 return wxCONV_FAILED
;
309 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
310 return wxCONV_FAILED
;
319 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
321 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
322 if ( rc
!= wxCONV_FAILED
)
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
332 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
334 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
335 if ( rc
!= wxCONV_FAILED
)
343 wxMBConv::~wxMBConv()
345 // nothing to do here (necessary for Darwin linking probably)
348 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
352 // calculate the length of the buffer needed first
353 const size_t nLen
= ToWChar(NULL
, 0, psz
);
354 if ( nLen
!= wxCONV_FAILED
)
356 // now do the actual conversion
357 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
359 // +1 for the trailing NULL
360 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
365 return wxWCharBuffer();
368 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
372 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
373 if ( nLen
!= wxCONV_FAILED
)
375 wxCharBuffer
buf(nLen
- 1);
376 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
381 return wxCharBuffer();
385 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
387 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
388 if ( dstLen
!= wxCONV_FAILED
)
390 // notice that we allocate space for dstLen+1 wide characters here
391 // because we want the buffer to always be NUL-terminated, even if the
392 // input isn't (as otherwise the caller has no way to know its length)
393 wxWCharBuffer
wbuf(dstLen
);
394 wbuf
.data()[dstLen
- 1] = L
'\0';
395 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
400 if ( wbuf
[dstLen
- 1] == L
'\0' )
411 return wxWCharBuffer();
415 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
417 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
418 if ( dstLen
!= wxCONV_FAILED
)
420 const size_t nulLen
= GetMBNulLen();
422 // as above, ensure that the buffer is always NUL-terminated, even if
424 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
425 memset(buf
.data() + dstLen
, 0, nulLen
);
426 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
432 if ( dstLen
>= nulLen
&&
433 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
435 // in this case the output is NUL-terminated and we're not
436 // supposed to count NUL
448 return wxCharBuffer();
451 // ----------------------------------------------------------------------------
453 // ----------------------------------------------------------------------------
455 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
457 return wxMB2WC(buf
, psz
, n
);
460 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
462 return wxWC2MB(buf
, psz
, n
);
465 // ----------------------------------------------------------------------------
466 // wxConvBrokenFileNames
467 // ----------------------------------------------------------------------------
471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
473 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
474 wxStricmp(charset
, _T("UTF8")) == 0 )
475 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
477 m_conv
= new wxCSConv(charset
);
482 // ----------------------------------------------------------------------------
484 // ----------------------------------------------------------------------------
486 // Implementation (C) 2004 Fredrik Roubert
489 // BASE64 decoding table
491 static const unsigned char utf7unb64
[] =
493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
494 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
499 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
500 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
501 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
502 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
503 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
504 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
505 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
506 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
507 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
508 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
510 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
527 size_t wxMBConvUTF7::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
531 while ( *psz
&& (!buf
|| (len
< n
)) )
533 unsigned char cc
= *psz
++;
541 else if (*psz
== '-')
549 else // start of BASE64 encoded string
553 for ( ok
= lsb
= false, d
= 0, l
= 0;
554 (cc
= utf7unb64
[(unsigned char)*psz
]) != 0xff;
559 for (l
+= 6; l
>= 8; lsb
= !lsb
)
561 unsigned char c
= (unsigned char)((d
>> (l
-= 8)) % 256);
572 *buf
= (wchar_t)(c
<< 8);
579 // in valid UTF7 we should have valid characters after '+'
580 return wxCONV_FAILED
;
588 if ( buf
&& (len
< n
) )
595 // BASE64 encoding table
597 static const unsigned char utf7enb64
[] =
599 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
600 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
601 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
602 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
603 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
604 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
605 'w', 'x', 'y', 'z', '0', '1', '2', '3',
606 '4', '5', '6', '7', '8', '9', '+', '/'
610 // UTF-7 encoding table
612 // 0 - Set D (directly encoded characters)
613 // 1 - Set O (optional direct characters)
614 // 2 - whitespace characters (optional)
615 // 3 - special characters
617 static const unsigned char utf7encode
[128] =
619 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
620 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
621 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
622 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
623 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
624 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
625 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
626 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
629 size_t wxMBConvUTF7::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
633 while (*psz
&& ((!buf
) || (len
< n
)))
636 if (cc
< 0x80 && utf7encode
[cc
] < 1)
645 else if (((wxUint32
)cc
) > 0xffff)
647 // no surrogate pair generation (yet?)
648 return wxCONV_FAILED
;
659 // BASE64 encode string
660 unsigned int lsb
, d
, l
;
661 for (d
= 0, l
= 0; /*nothing*/; psz
++)
663 for (lsb
= 0; lsb
< 2; lsb
++)
666 d
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
668 for (l
+= 8; l
>= 6; )
672 *buf
++ = utf7enb64
[(d
>> l
) % 64];
678 if (!(cc
) || (cc
< 0x80 && utf7encode
[cc
] < 1))
685 *buf
++ = utf7enb64
[((d
% 16) << (6 - l
)) % 64];
697 if (buf
&& (len
< n
))
703 // ----------------------------------------------------------------------------
705 // ----------------------------------------------------------------------------
707 static const wxUint32 utf8_max
[]=
708 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
710 // boundaries of the private use area we use to (temporarily) remap
711 // characters invalid in a UTF-8 encoded string
712 const wxUint32 wxUnicodePUA
= 0x100000;
713 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
715 // this table gives the length of the UTF-8 encoding from its first character:
716 const unsigned char tableUtf8Lengths
[256] = {
717 // single-byte sequences (ASCII):
718 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
719 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
720 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
721 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
722 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
723 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
724 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
725 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
727 // these are invalid:
728 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
731 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
734 // two-byte sequences:
735 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
736 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
738 // three-byte sequences:
739 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
741 // four-byte sequences:
742 4, 4, 4, 4, 4, // F0..F4
744 // these are invalid again (5- or 6-byte
745 // sequences and sequences for code points
746 // above U+10FFFF, as restricted by RFC 3629):
747 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
751 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
752 const char *src
, size_t srcLen
) const
754 wchar_t *out
= dstLen
? dst
: NULL
;
757 if ( srcLen
== wxNO_LEN
)
758 srcLen
= strlen(src
) + 1;
760 for ( const char *p
= src
; ; p
++ )
762 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
764 // all done successfully, just add the trailing NULL if we are not
765 // using explicit length
766 if ( srcLen
== wxNO_LEN
)
782 if ( out
&& !dstLen
-- )
786 unsigned char c
= *p
;
790 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
793 if ( srcLen
!= wxNO_LEN
)
800 unsigned len
= tableUtf8Lengths
[c
];
804 if ( srcLen
< len
) // the test works for wxNO_LEN too
807 if ( srcLen
!= wxNO_LEN
)
810 // Char. number range | UTF-8 octet sequence
811 // (hexadecimal) | (binary)
812 // ----------------------+----------------------------------------
813 // 0000 0000 - 0000 007F | 0xxxxxxx
814 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
815 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
816 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
818 // Code point value is stored in bits marked with 'x',
819 // lowest-order bit of the value on the right side in the diagram
820 // above. (from RFC 3629)
822 // mask to extract lead byte's value ('x' bits above), by sequence
824 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
826 // mask and value of lead byte's most significant bits, by length:
827 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
828 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
830 len
--; // it's more convenient to work with 0-based length here
832 // extract the lead byte's value bits:
833 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
836 code
= c
& leadValueMask
[len
];
838 // all remaining bytes, if any, are handled in the same way
839 // regardless of sequence's length:
843 if ( (c
& 0xC0) != 0x80 )
844 return wxCONV_FAILED
;
852 // cast is ok because wchar_t == wxUint16 if WC_UTF16
853 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
862 #endif // WC_UTF16/!WC_UTF16
870 return wxCONV_FAILED
;
874 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
875 const wchar_t *src
, size_t srcLen
) const
877 char *out
= dstLen
? dst
: NULL
;
880 for ( const wchar_t *wp
= src
; ; wp
++ )
882 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
--) )
884 // all done successfully, just add the trailing NULL if we are not
885 // using explicit length
886 if ( srcLen
== wxNO_LEN
)
905 // cast is ok for WC_UTF16
906 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
908 // skip the next char too as we decoded a surrogate
911 #else // wchar_t is UTF-32
912 code
= *wp
& 0x7fffffff;
927 else if ( code
<= 0x07FF )
935 // NB: this line takes 6 least significant bits, encodes them as
936 // 10xxxxxx and discards them so that the next byte can be encoded:
937 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
938 out
[0] = 0xC0 | code
;
941 else if ( code
< 0xFFFF )
949 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
950 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
951 out
[0] = 0xE0 | code
;
954 else if ( code
<= 0x10FFFF )
962 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
963 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
964 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
965 out
[0] = 0xF0 | code
;
970 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
983 // we only get here if an error occurs during decoding
984 return wxCONV_FAILED
;
987 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
988 const char *psz
, size_t srcLen
) const
990 if ( m_options
== MAP_INVALID_UTF8_NOT
)
991 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
995 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
997 const char *opsz
= psz
;
998 bool invalid
= false;
999 unsigned char cc
= *psz
++, fc
= cc
;
1001 for (cnt
= 0; fc
& 0x80; cnt
++)
1011 // escape the escape character for octal escapes
1012 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1013 && cc
== '\\' && (!buf
|| len
< n
))
1025 // invalid UTF-8 sequence
1030 unsigned ocnt
= cnt
- 1;
1031 wxUint32 res
= cc
& (0x3f >> cnt
);
1035 if ((cc
& 0xC0) != 0x80)
1037 // invalid UTF-8 sequence
1043 res
= (res
<< 6) | (cc
& 0x3f);
1046 if (invalid
|| res
<= utf8_max
[ocnt
])
1048 // illegal UTF-8 encoding
1051 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1052 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1054 // if one of our PUA characters turns up externally
1055 // it must also be treated as an illegal sequence
1056 // (a bit like you have to escape an escape character)
1062 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1063 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1064 if (pa
== wxCONV_FAILED
)
1076 *buf
++ = (wchar_t)res
;
1078 #endif // WC_UTF16/!WC_UTF16
1084 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1086 while (opsz
< psz
&& (!buf
|| len
< n
))
1089 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1090 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1091 wxASSERT(pa
!= wxCONV_FAILED
);
1098 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1104 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1106 while (opsz
< psz
&& (!buf
|| len
< n
))
1108 if ( buf
&& len
+ 3 < n
)
1110 unsigned char on
= *opsz
;
1112 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1113 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1114 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1121 else // MAP_INVALID_UTF8_NOT
1123 return wxCONV_FAILED
;
1129 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
// Returns true if wch is one of the octal digits L'0' to L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1140 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1141 const wchar_t *psz
, size_t srcLen
) const
1143 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1144 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1148 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1153 // cast is ok for WC_UTF16
1154 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1155 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1157 cc
= (*psz
++) & 0x7fffffff;
1160 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1161 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1164 *buf
++ = (char)(cc
- wxUnicodePUA
);
1167 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1168 && cc
== L
'\\' && psz
[0] == L
'\\' )
1175 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1177 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1181 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1182 (psz
[1] - L
'0') * 010 +
1192 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1208 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1210 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1216 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1222 // ============================================================================
1224 // ============================================================================
1226 #ifdef WORDS_BIGENDIAN
1227 #define wxMBConvUTF16straight wxMBConvUTF16BE
1228 #define wxMBConvUTF16swap wxMBConvUTF16LE
1230 #define wxMBConvUTF16swap wxMBConvUTF16BE
1231 #define wxMBConvUTF16straight wxMBConvUTF16LE
1235 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1237 if ( srcLen
== wxNO_LEN
)
1239 // count the number of bytes in input, including the trailing NULs
1240 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1241 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1244 srcLen
*= BYTES_PER_CHAR
;
1246 else // we already have the length
1248 // we can only convert an entire number of UTF-16 characters
1249 if ( srcLen
% BYTES_PER_CHAR
)
1250 return wxCONV_FAILED
;
1256 // case when in-memory representation is UTF-16 too
1259 // ----------------------------------------------------------------------------
1260 // conversions without endianness change
1261 // ----------------------------------------------------------------------------
1264 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1265 const char *src
, size_t srcLen
) const
1267 // set up the scene for using memcpy() (which is presumably more efficient
1268 // than copying the bytes one by one)
1269 srcLen
= GetLength(src
, srcLen
);
1270 if ( srcLen
== wxNO_LEN
)
1271 return wxCONV_FAILED
;
1273 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1276 if ( dstLen
< inLen
)
1277 return wxCONV_FAILED
;
1279 memcpy(dst
, src
, srcLen
);
1286 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1287 const wchar_t *src
, size_t srcLen
) const
1289 if ( srcLen
== wxNO_LEN
)
1290 srcLen
= wxWcslen(src
) + 1;
1292 srcLen
*= BYTES_PER_CHAR
;
1296 if ( dstLen
< srcLen
)
1297 return wxCONV_FAILED
;
1299 memcpy(dst
, src
, srcLen
);
1305 // ----------------------------------------------------------------------------
1306 // endian-reversing conversions
1307 // ----------------------------------------------------------------------------
1310 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1311 const char *src
, size_t srcLen
) const
1313 srcLen
= GetLength(src
, srcLen
);
1314 if ( srcLen
== wxNO_LEN
)
1315 return wxCONV_FAILED
;
1317 srcLen
/= BYTES_PER_CHAR
;
1321 if ( dstLen
< srcLen
)
1322 return wxCONV_FAILED
;
1324 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1325 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1327 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1335 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1336 const wchar_t *src
, size_t srcLen
) const
1338 if ( srcLen
== wxNO_LEN
)
1339 srcLen
= wxWcslen(src
) + 1;
1341 srcLen
*= BYTES_PER_CHAR
;
1345 if ( dstLen
< srcLen
)
1346 return wxCONV_FAILED
;
1348 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1349 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1351 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1358 #else // !WC_UTF16: wchar_t is UTF-32
1360 // ----------------------------------------------------------------------------
1361 // conversions without endianness change
1362 // ----------------------------------------------------------------------------
1365 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1366 const char *src
, size_t srcLen
) const
1368 srcLen
= GetLength(src
, srcLen
);
1369 if ( srcLen
== wxNO_LEN
)
1370 return wxCONV_FAILED
;
1372 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1375 // optimization: return maximal space which could be needed for this
1376 // string even if the real size could be smaller if the buffer contains
1382 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1383 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1385 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1387 return wxCONV_FAILED
;
1389 if ( ++outLen
> dstLen
)
1390 return wxCONV_FAILED
;
1400 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1401 const wchar_t *src
, size_t srcLen
) const
1403 if ( srcLen
== wxNO_LEN
)
1404 srcLen
= wxWcslen(src
) + 1;
1407 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1408 for ( size_t n
= 0; n
< srcLen
; n
++ )
1411 const size_t numChars
= encode_utf16(*src
++, cc
);
1412 if ( numChars
== wxCONV_FAILED
)
1413 return wxCONV_FAILED
;
1415 outLen
+= numChars
* BYTES_PER_CHAR
;
1418 if ( outLen
> dstLen
)
1419 return wxCONV_FAILED
;
1422 if ( numChars
== 2 )
1424 // second character of a surrogate
1433 // ----------------------------------------------------------------------------
1434 // endian-reversing conversions
1435 // ----------------------------------------------------------------------------
1438 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1439 const char *src
, size_t srcLen
) const
1441 srcLen
= GetLength(src
, srcLen
);
1442 if ( srcLen
== wxNO_LEN
)
1443 return wxCONV_FAILED
;
1445 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1448 // optimization: return maximal space which could be needed for this
1449 // string even if the real size could be smaller if the buffer contains
1455 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1456 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1461 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1463 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1465 const size_t numChars
= decode_utf16(tmp
, ch
);
1466 if ( numChars
== wxCONV_FAILED
)
1467 return wxCONV_FAILED
;
1469 if ( numChars
== 2 )
1472 if ( ++outLen
> dstLen
)
1473 return wxCONV_FAILED
;
1483 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1484 const wchar_t *src
, size_t srcLen
) const
1486 if ( srcLen
== wxNO_LEN
)
1487 srcLen
= wxWcslen(src
) + 1;
1490 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1491 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1494 const size_t numChars
= encode_utf16(*src
, cc
);
1495 if ( numChars
== wxCONV_FAILED
)
1496 return wxCONV_FAILED
;
1498 outLen
+= numChars
* BYTES_PER_CHAR
;
1501 if ( outLen
> dstLen
)
1502 return wxCONV_FAILED
;
1504 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1505 if ( numChars
== 2 )
1507 // second character of a surrogate
1508 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1516 #endif // WC_UTF16/!WC_UTF16
1519 // ============================================================================
1521 // ============================================================================
1523 #ifdef WORDS_BIGENDIAN
1524 #define wxMBConvUTF32straight wxMBConvUTF32BE
1525 #define wxMBConvUTF32swap wxMBConvUTF32LE
1527 #define wxMBConvUTF32swap wxMBConvUTF32BE
1528 #define wxMBConvUTF32straight wxMBConvUTF32LE
1532 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1533 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1536 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1538 if ( srcLen
== wxNO_LEN
)
1540 // count the number of bytes in input, including the trailing NULs
1541 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1542 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1545 srcLen
*= BYTES_PER_CHAR
;
1547 else // we already have the length
1549 // we can only convert an entire number of UTF-32 characters
1550 if ( srcLen
% BYTES_PER_CHAR
)
1551 return wxCONV_FAILED
;
1557 // case when in-memory representation is UTF-16
1560 // ----------------------------------------------------------------------------
1561 // conversions without endianness change
1562 // ----------------------------------------------------------------------------
1565 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1566 const char *src
, size_t srcLen
) const
1568 srcLen
= GetLength(src
, srcLen
);
1569 if ( srcLen
== wxNO_LEN
)
1570 return wxCONV_FAILED
;
1572 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1573 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1575 for ( size_t n
= 0; n
< inLen
; n
++ )
1578 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1579 if ( numChars
== wxCONV_FAILED
)
1580 return wxCONV_FAILED
;
1585 if ( outLen
> dstLen
)
1586 return wxCONV_FAILED
;
1589 if ( numChars
== 2 )
1591 // second character of a surrogate
1601 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1602 const wchar_t *src
, size_t srcLen
) const
1604 if ( srcLen
== wxNO_LEN
)
1605 srcLen
= wxWcslen(src
) + 1;
1609 // optimization: return maximal space which could be needed for this
1610 // string instead of the exact amount which could be less if there are
1611 // any surrogates in the input
1613 // we consider that surrogates are rare enough to make it worthwhile to
1614 // avoid running the loop below at the cost of slightly extra memory
1616 return srcLen
* BYTES_PER_CHAR
;
1619 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1621 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1623 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1625 return wxCONV_FAILED
;
1627 outLen
+= BYTES_PER_CHAR
;
1629 if ( outLen
> dstLen
)
1630 return wxCONV_FAILED
;
1638 // ----------------------------------------------------------------------------
1639 // endian-reversing conversions
1640 // ----------------------------------------------------------------------------
1643 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1644 const char *src
, size_t srcLen
) const
1646 srcLen
= GetLength(src
, srcLen
);
1647 if ( srcLen
== wxNO_LEN
)
1648 return wxCONV_FAILED
;
1650 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1651 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1653 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1656 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1657 if ( numChars
== wxCONV_FAILED
)
1658 return wxCONV_FAILED
;
1663 if ( outLen
> dstLen
)
1664 return wxCONV_FAILED
;
1667 if ( numChars
== 2 )
1669 // second character of a surrogate
1679 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1680 const wchar_t *src
, size_t srcLen
) const
1682 if ( srcLen
== wxNO_LEN
)
1683 srcLen
= wxWcslen(src
) + 1;
1687 // optimization: return maximal space which could be needed for this
1688 // string instead of the exact amount which could be less if there are
1689 // any surrogates in the input
1691 // we consider that surrogates are rare enough to make it worthwhile to
1692 // avoid running the loop below at the cost of slightly extra memory
1694 return srcLen
*BYTES_PER_CHAR
;
1697 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1699 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1701 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1703 return wxCONV_FAILED
;
1705 outLen
+= BYTES_PER_CHAR
;
1707 if ( outLen
> dstLen
)
1708 return wxCONV_FAILED
;
1710 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1716 #else // !WC_UTF16: wchar_t is UTF-32
1718 // ----------------------------------------------------------------------------
1719 // conversions without endianness change
1720 // ----------------------------------------------------------------------------
// Convert UTF-32 input in the same byte order as wchar_t to wide characters.
// This is the variant used when wchar_t is itself UTF-32 (the !WC_UTF16
// branch), so the input bytes are simply copied to the output.
//
// The conversion fails with wxCONV_FAILED if the input length can't be
// determined by GetLength() or if dstLen is smaller than the number of
// characters in the input.
1723 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1724 const char *src
, size_t srcLen
) const
1726 // use memcpy() as it should be much faster than hand-written loop
1727 srcLen
= GetLength(src
, srcLen
);
// NOTE(review): GetLength() reports failure by returning wxCONV_FAILED while
// this test compares against wxNO_LEN; presumably both constants have the
// same value -- confirm
1728 if ( srcLen
== wxNO_LEN
)
1729 return wxCONV_FAILED
;
// srcLen is in bytes, inLen is the corresponding number of characters
1731 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1734 if ( dstLen
< inLen
)
1735 return wxCONV_FAILED
;
1737 memcpy(dst
, src
, srcLen
);
1744 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1745 const wchar_t *src
, size_t srcLen
) const
1747 if ( srcLen
== wxNO_LEN
)
1748 srcLen
= wxWcslen(src
) + 1;
1750 srcLen
*= BYTES_PER_CHAR
;
1754 if ( dstLen
< srcLen
)
1755 return wxCONV_FAILED
;
1757 memcpy(dst
, src
, srcLen
);
1763 // ----------------------------------------------------------------------------
1764 // endian-reversing conversions
1765 // ----------------------------------------------------------------------------
// Convert UTF-32 input in the byte order opposite to the one of wchar_t to
// wide characters. This is the variant used when wchar_t is itself UTF-32
// (the !WC_UTF16 branch): each 32-bit unit is byte-swapped and stored.
//
// The conversion fails with wxCONV_FAILED if the input length can't be
// determined by GetLength() or if dstLen is smaller than the number of
// characters in the input.
1768 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1769 const char *src
, size_t srcLen
) const
1771 srcLen
= GetLength(src
, srcLen
);
// NOTE(review): GetLength() reports failure by returning wxCONV_FAILED while
// this test compares against wxNO_LEN; presumably both constants have the
// same value -- confirm
1772 if ( srcLen
== wxNO_LEN
)
1773 return wxCONV_FAILED
;
// from now on srcLen is the number of 32-bit characters and not of bytes
1775 srcLen
/= BYTES_PER_CHAR
;
// the output buffer must have room for every input character
1779 if ( dstLen
< srcLen
)
1780 return wxCONV_FAILED
;
1782 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1783 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1785 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1793 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1794 const wchar_t *src
, size_t srcLen
) const
1796 if ( srcLen
== wxNO_LEN
)
1797 srcLen
= wxWcslen(src
) + 1;
1799 srcLen
*= BYTES_PER_CHAR
;
1803 if ( dstLen
< srcLen
)
1804 return wxCONV_FAILED
;
1806 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1807 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1809 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1816 #endif // WC_UTF16/!WC_UTF16
1819 // ============================================================================
1820 // The classes doing conversion using the iconv_xxx() functions
1821 // ============================================================================
1825 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1826 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1827 // (unless there's yet another bug in glibc) the only case when iconv()
1828 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1829 // left in the input buffer -- when _real_ error occurs,
1830 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1832 // [This bug does not appear in glibc 2.2.]
1833 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1834 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1835 (errno != E2BIG || bufLeft != 0))
1837 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1840 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1842 #define ICONV_T_INVALID ((iconv_t)-1)
1844 #if SIZEOF_WCHAR_T == 4
1845 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1846 #define WC_ENC wxFONTENCODING_UTF32
1847 #elif SIZEOF_WCHAR_T == 2
1848 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1849 #define WC_ENC wxFONTENCODING_UTF16
1850 #else // sizeof(wchar_t) != 2 nor 4
1851 // does this ever happen?
1852 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1855 // ----------------------------------------------------------------------------
1856 // wxMBConv_iconv: encapsulates an iconv character set
1857 // ----------------------------------------------------------------------------
1859 class wxMBConv_iconv
: public wxMBConv
1862 wxMBConv_iconv(const char *name
);
1863 virtual ~wxMBConv_iconv();
1865 // implement base class virtual methods
1866 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
1867 const char *src
, size_t srcLen
= wxNO_LEN
) const;
1868 virtual size_t FromWChar(char *dst
, size_t dstLen
,
1869 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
1870 virtual size_t GetMBNulLen() const;
1872 #if wxUSE_UNICODE_UTF8
1873 virtual bool IsUTF8() const;
1876 virtual wxMBConv
*Clone() const
1878 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
1879 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1884 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1887 // the iconv handlers used to translate from multibyte
1888 // to wide char and in the other direction
1893 // guards access to m2w and w2m objects
1894 wxMutex m_iconvMutex
;
1898 // the name (for iconv_open()) of a wide char charset -- if none is
1899 // available on this machine, it will remain empty
1900 static wxString ms_wcCharsetName
;
1902 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1903 // different endian-ness than the native one
1904 static bool ms_wcNeedsSwap
;
1907 // name of the encoding handled by this conversion
1910 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1912 size_t m_minMBCharWidth
;
1915 // make the constructor available for unit testing
1916 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
1918 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
1919 if ( !result
->IsOk() )
1928 wxString
wxMBConv_iconv::ms_wcCharsetName
;
1929 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
1931 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
1934 m_minMBCharWidth
= 0;
1936 // check for charset that represents wchar_t:
1937 if ( ms_wcCharsetName
.empty() )
1939 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
1942 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
1943 #else // !wxUSE_FONTMAP
// Fallback list of candidate charset names for wchar_t, used in the
// !wxUSE_FONTMAP branch; which entry is compiled in depends on the size of
// wchar_t.
//
// NB: the tests below must be comparisons: assignment ("=") is not permitted
//     in a preprocessor constant expression, so "SIZEOF_WCHAR_T = 2" would
//     fail to compile whenever the #elif condition has to be evaluated, i.e.
//     whenever SIZEOF_WCHAR_T is not 4.
1944 static const wxChar
*names_static
[] =
1946 #if SIZEOF_WCHAR_T == 4
1948 #elif SIZEOF_WCHAR_T == 2
1953 const wxChar
**names
= names_static
;
1954 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1956 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
1958 const wxString
nameCS(*names
);
1960 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1961 wxString
nameXE(nameCS
);
1963 #ifdef WORDS_BIGENDIAN
1965 #else // little endian
1969 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1972 m2w
= iconv_open(nameXE
.ToAscii(), name
);
1973 if ( m2w
== ICONV_T_INVALID
)
1975 // try charset w/o bytesex info (e.g. "UCS4")
1976 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
1978 m2w
= iconv_open(nameCS
.ToAscii(), name
);
1980 // and check for bytesex ourselves:
1981 if ( m2w
!= ICONV_T_INVALID
)
1983 char buf
[2], *bufPtr
;
1992 outsz
= SIZEOF_WCHAR_T
* 2;
1993 char* wbufPtr
= (char*)wbuf
;
1997 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2000 if (ICONV_FAILED(res
, insz
))
2002 wxLogLastError(wxT("iconv"));
2003 wxLogError(_("Conversion to charset '%s' doesn't work."),
2006 else // ok, can convert to this encoding, remember it
2008 ms_wcCharsetName
= nameCS
;
2009 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2013 else // use charset not requiring byte swapping
2015 ms_wcCharsetName
= nameXE
;
2019 wxLogTrace(TRACE_STRCONV
,
2020 wxT("iconv wchar_t charset is \"%s\"%s"),
2021 ms_wcCharsetName
.empty() ? wxString("<none>")
2023 ms_wcNeedsSwap
? _T(" (needs swap)")
2026 else // we already have ms_wcCharsetName
2028 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2031 if ( ms_wcCharsetName
.empty() )
2033 w2m
= ICONV_T_INVALID
;
2037 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2038 if ( w2m
== ICONV_T_INVALID
)
2040 wxLogTrace(TRACE_STRCONV
,
2041 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2042 ms_wcCharsetName
.c_str(), name
);
2047 wxMBConv_iconv::~wxMBConv_iconv()
2049 if ( m2w
!= ICONV_T_INVALID
)
2051 if ( w2m
!= ICONV_T_INVALID
)
2056 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2057 const char *src
, size_t srcLen
) const
2059 if ( srcLen
== wxNO_LEN
)
2061 // find the string length: notice that must be done differently for
2062 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2064 const size_t nulLen
= GetMBNulLen();
2068 return wxCONV_FAILED
;
2071 srcLen
= strlen(src
); // arguably more optimized than our version
2076 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2077 // but they also have to start at character boundary and not
2078 // span two adjacent characters
2080 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2086 // when we're determining the length of the string ourselves we count
2087 // the terminating NUL(s) as part of it and always NUL-terminate the
2092 // we express length in the number of (wide) characters but iconv always
2093 // counts buffer sizes in bytes
2094 dstLen
*= SIZEOF_WCHAR_T
;
2097 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2098 // Unfortunately there are a couple of global wxCSConv objects such as
2099 // wxConvLocal that are used all over wx code, so we have to make sure
2100 // the handle is used by at most one thread at the time. Otherwise
2101 // only a few wx classes would be safe to use from non-main threads
2102 // as MB<->WC conversion would fail "randomly".
2103 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2104 #endif // wxUSE_THREADS
2107 const char *pszPtr
= src
;
2111 char* bufPtr
= (char*)dst
;
2113 // have destination buffer, convert there
2114 size_t dstLenOrig
= dstLen
;
2116 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2119 // convert the number of bytes converted as returned by iconv to the
2120 // number of (wide) characters converted that we need
2121 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2125 // convert to native endianness
2126 for ( unsigned i
= 0; i
< res
; i
++ )
2127 dst
[i
] = WC_BSWAP(dst
[i
]);
2130 else // no destination buffer
2132 // convert using temp buffer to calculate the size of the buffer needed
2138 char* bufPtr
= (char*)tbuf
;
2139 dstLen
= 8 * SIZEOF_WCHAR_T
;
2142 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2145 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
2147 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2150 if (ICONV_FAILED(cres
, srcLen
))
2152 //VS: it is ok if iconv fails, hence trace only
2153 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2154 return wxCONV_FAILED
;
2160 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2161 const wchar_t *src
, size_t srcLen
) const
2164 // NB: explained in MB2WC
2165 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2168 if ( srcLen
== wxNO_LEN
)
2169 srcLen
= wxWcslen(src
) + 1;
2171 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2172 size_t outbuflen
= dstLen
;
2175 wchar_t *tmpbuf
= 0;
2179 // need to copy to temp buffer to switch endianness
2180 // (doing WC_BSWAP twice on the original buffer won't help, as it
2181 // could be in read-only memory, or be accessed in some other thread)
2182 tmpbuf
= (wchar_t *)malloc(inbuflen
+ SIZEOF_WCHAR_T
);
2183 for ( size_t i
= 0; i
< srcLen
; i
++ )
2184 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2186 tmpbuf
[srcLen
] = L
'\0';
2190 char* inbuf
= (char*)src
;
2193 // have destination buffer, convert there
2194 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2196 res
= dstLen
- outbuflen
;
2198 else // no destination buffer
2200 // convert using temp buffer to calculate the size of the buffer needed
2208 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2210 res
+= 16 - outbuflen
;
2212 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2220 if (ICONV_FAILED(cres
, inbuflen
))
2222 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2223 return wxCONV_FAILED
;
// Return the number of bytes used by the terminating NUL in the multibyte
// encoding handled by this object.
//
// The value is computed only while the cached m_minMBCharWidth is still 0:
// a single wide NUL is converted using the w2m handle and the number of bytes
// written to the output is stored in m_minMBCharWidth. If iconv() fails,
// (size_t)-1 is stored instead. The cached value is what is returned.
2229 size_t wxMBConv_iconv::GetMBNulLen() const
2231 if ( m_minMBCharWidth
== 0 )
// this method is const but needs to update the cache (and lock the mutex)
2233 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2236 // NB: the iconv handles are guarded by m_iconvMutex, see the comment
//     about MT-safety in ToWChar()
2237 wxMutexLocker
lock(self
->m_iconvMutex
);
// the input is just the NUL terminating an empty wide string
2240 const wchar_t *wnul
= L
"";
2241 char buf
[8]; // should be enough for NUL in any encoding
2242 size_t inLen
= sizeof(wchar_t),
2243 outLen
= WXSIZEOF(buf
);
2244 char *inBuff
= (char *)wnul
;
2245 char *outBuff
= buf
;
2246 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
// conversion failed: remember the failure in the cache
2248 self
->m_minMBCharWidth
= (size_t)-1;
// outBuff was advanced by iconv() past the bytes it has written, so the
// difference is the size of the converted NUL
2252 self
->m_minMBCharWidth
= outBuff
- buf
;
2256 return m_minMBCharWidth
;
2259 #if wxUSE_UNICODE_UTF8
// Return true if this converter was created for UTF-8, which is decided by
// comparing m_name with both the "UTF-8" and "UTF8" spellings using
// wxStricmp() (presumably ignoring case -- confirm).
2260 bool wxMBConv_iconv::IsUTF8() const
2262 return wxStricmp(m_name
, "UTF-8") == 0 ||
2263 wxStricmp(m_name
, "UTF8") == 0;
2267 #endif // HAVE_ICONV
2270 // ============================================================================
2271 // Win32 conversion classes
2272 // ============================================================================
2274 #ifdef wxHAVE_WIN32_MB2WC
2278 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2279 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2282 class wxMBConv_win32
: public wxMBConv
2287 m_CodePage
= CP_ACP
;
2288 m_minMBCharWidth
= 0;
2291 wxMBConv_win32(const wxMBConv_win32
& conv
)
2294 m_CodePage
= conv
.m_CodePage
;
2295 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2299 wxMBConv_win32(const char* name
)
2301 m_CodePage
= wxCharsetToCodepage(name
);
2302 m_minMBCharWidth
= 0;
2305 wxMBConv_win32(wxFontEncoding encoding
)
2307 m_CodePage
= wxEncodingToCodepage(encoding
);
2308 m_minMBCharWidth
= 0;
2310 #endif // wxUSE_FONTMAP
2312 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2314 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2315 // the behaviour is not compatible with the Unix version (using iconv)
2316 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2317 // wouldn't work if reading an incomplete MB char didn't result in an
2320 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2321 // Win XP or newer and it is not supported for UTF-[78] so we always
2322 // use our own conversions in this case. See
2323 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2324 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2325 if ( m_CodePage
== CP_UTF8
)
2327 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2330 if ( m_CodePage
== CP_UTF7
)
2332 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2336 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2337 IsAtLeastWin2kSP4() )
2339 flags
= MB_ERR_INVALID_CHARS
;
2342 const size_t len
= ::MultiByteToWideChar
2344 m_CodePage
, // code page
2345 flags
, // flags: fall on error
2346 psz
, // input string
2347 -1, // its length (NUL-terminated)
2348 buf
, // output string
2349 buf
? n
: 0 // size of output buffer
2353 // function totally failed
2354 return wxCONV_FAILED
;
2357 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2358 // check if we succeeded, by doing a double trip:
2359 if ( !flags
&& buf
)
2361 const size_t mbLen
= strlen(psz
);
2362 wxCharBuffer
mbBuf(mbLen
);
2363 if ( ::WideCharToMultiByte
2370 mbLen
+ 1, // size in bytes, not length
2374 strcmp(mbBuf
, psz
) != 0 )
2376 // we didn't obtain the same thing we started from, hence
2377 // the conversion was lossy and we consider that it failed
2378 return wxCONV_FAILED
;
2382 // note that it returns count of written chars for buf != NULL and size
2383 // of the needed buffer for buf == NULL so in either case the length of
2384 // the string (which never includes the terminating NUL) is one less
2388 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2391 we have a problem here: by default, WideCharToMultiByte() may
2392 replace characters unrepresentable in the target code page with bad
2393 quality approximations such as turning "1/2" symbol (U+00BD) into
2394 "1" for the code pages which don't have it and we, obviously, want
2395 to avoid this at any price
2397 the trouble is that this function does it _silently_, i.e. it won't
2398 even tell us whether it did or not... Win98/2000 and higher provide
2399 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2400 we have to resort to a round trip, i.e. check that converting back
2401 results in the same string -- this is, of course, expensive but
2402 otherwise we simply can't be sure to not garble the data.
2405 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2406 // it doesn't work with CJK encodings (which we test for rather roughly
2407 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2409 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2412 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2414 // it's our lucky day
2415 flags
= WC_NO_BEST_FIT_CHARS
;
2416 pUsedDef
= &usedDef
;
2418 else // old system or unsupported encoding
2424 const size_t len
= ::WideCharToMultiByte
2426 m_CodePage
, // code page
2427 flags
, // either none or no best fit
2428 pwz
, // input string
2429 -1, // it is (wide) NUL-terminated
2430 buf
, // output buffer
2431 buf
? n
: 0, // and its size
2432 NULL
, // default "replacement" char
2433 pUsedDef
// [out] was it used?
2438 // function totally failed
2439 return wxCONV_FAILED
;
2442 // we did something, check if we really succeeded
2445 // check if the conversion failed, i.e. if any replacements
2448 return wxCONV_FAILED
;
2450 else // we must resort to double tripping...
2452 // first we need to ensure that we really have the MB data: this is
2453 // not the case if we're called with NULL buffer, in which case we
2454 // need to do the conversion yet again
2455 wxCharBuffer bufDef
;
2458 bufDef
= wxCharBuffer(len
);
2459 buf
= bufDef
.data();
2460 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2461 buf
, len
, NULL
, NULL
) )
2462 return wxCONV_FAILED
;
2467 wxWCharBuffer
wcBuf(n
);
2468 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2469 wcscmp(wcBuf
, pwz
) != 0 )
2471 // we didn't obtain the same thing we started from, hence
2472 // the conversion was lossy and we consider that it failed
2473 return wxCONV_FAILED
;
2477 // see the comment above for the reason of "len - 1"
2481 virtual size_t GetMBNulLen() const
2483 if ( m_minMBCharWidth
== 0 )
2485 int len
= ::WideCharToMultiByte
2487 m_CodePage
, // code page
2489 L
"", // input string
2490 1, // translate just the NUL
2491 NULL
, // output buffer
2493 NULL
, // no replacement char
2494 NULL
// [out] don't care if it was used
2497 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2501 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2502 self
->m_minMBCharWidth
= (size_t)-1;
2506 self
->m_minMBCharWidth
= (size_t)-1;
2512 self
->m_minMBCharWidth
= len
;
2517 return m_minMBCharWidth
;
2520 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2522 bool IsOk() const { return m_CodePage
!= -1; }
2525 static bool CanUseNoBestFit()
2527 static int s_isWin98Or2k
= -1;
2529 if ( s_isWin98Or2k
== -1 )
2532 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2534 case wxOS_WINDOWS_9X
:
2535 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2538 case wxOS_WINDOWS_NT
:
2539 s_isWin98Or2k
= verMaj
>= 5;
2543 // unknown: be conservative by default
2548 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2551 return s_isWin98Or2k
== 1;
2554 static bool IsAtLeastWin2kSP4()
2559 static int s_isAtLeastWin2kSP4
= -1;
2561 if ( s_isAtLeastWin2kSP4
== -1 )
2563 OSVERSIONINFOEX ver
;
2565 memset(&ver
, 0, sizeof(ver
));
2566 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2567 GetVersionEx((OSVERSIONINFO
*)&ver
);
2569 s_isAtLeastWin2kSP4
=
2570 ((ver
.dwMajorVersion
> 5) || // Vista+
2571 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2572 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2573 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2577 return s_isAtLeastWin2kSP4
== 1;
2582 // the code page we're working with
2585 // cached result of GetMBNulLen(), set to 0 initially meaning
2587 size_t m_minMBCharWidth
;
2590 #endif // wxHAVE_WIN32_MB2WC
2593 // ============================================================================
2594 // wxEncodingConverter based conversion classes
2595 // ============================================================================
2599 class wxMBConv_wxwin
: public wxMBConv
2604 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2605 // The wxMBConv_cf class does a better job.
2606 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2607 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2608 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2612 // temporarily just use wxEncodingConverter stuff,
2613 // so that it works while a better implementation is built
2614 wxMBConv_wxwin(const char* name
)
2617 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2619 m_enc
= wxFONTENCODING_SYSTEM
;
2624 wxMBConv_wxwin(wxFontEncoding enc
)
2631 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2633 size_t inbuf
= strlen(psz
);
2636 if (!m2w
.Convert(psz
, buf
))
2637 return wxCONV_FAILED
;
2642 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2644 const size_t inbuf
= wxWcslen(psz
);
2647 if (!w2m
.Convert(psz
, buf
))
2648 return wxCONV_FAILED
;
2654 virtual size_t GetMBNulLen() const
2658 case wxFONTENCODING_UTF16BE
:
2659 case wxFONTENCODING_UTF16LE
:
2662 case wxFONTENCODING_UTF32BE
:
2663 case wxFONTENCODING_UTF32LE
:
2671 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2673 bool IsOk() const { return m_ok
; }
2676 wxFontEncoding m_enc
;
2677 wxEncodingConverter m2w
, w2m
;
2680 // were we initialized successfully?
2683 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2686 // make the constructors available for unit testing
2687 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2689 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2690 if ( !result
->IsOk() )
2699 #endif // wxUSE_FONTMAP
2701 // ============================================================================
2702 // wxCSConv implementation
2703 // ============================================================================
2705 void wxCSConv::Init()
2712 wxCSConv::wxCSConv(const wxString
& charset
)
2716 if ( !charset
.empty() )
2718 SetName(charset
.ToAscii());
2722 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2724 m_encoding
= wxFONTENCODING_SYSTEM
;
2728 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2730 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2732 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2734 encoding
= wxFONTENCODING_SYSTEM
;
2739 m_encoding
= encoding
;
2742 wxCSConv::~wxCSConv()
2747 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2752 SetName(conv
.m_name
);
2753 m_encoding
= conv
.m_encoding
;
2756 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2760 SetName(conv
.m_name
);
2761 m_encoding
= conv
.m_encoding
;
2766 void wxCSConv::Clear()
2775 void wxCSConv::SetName(const char *charset
)
2779 m_name
= wxStrdup(charset
);
2786 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2787 wxEncodingNameCache
);
2789 static wxEncodingNameCache gs_nameCache
;
2792 wxMBConv
*wxCSConv::DoCreate() const
2795 wxLogTrace(TRACE_STRCONV
,
2796 wxT("creating conversion for %s"),
2798 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2799 #endif // wxUSE_FONTMAP
2801 // check for the special case of ASCII or ISO8859-1 charset: as we have
2802 // special knowledge of it anyhow, we don't need to create a special
2803 // conversion object
2804 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2805 m_encoding
== wxFONTENCODING_DEFAULT
)
2807 // don't convert at all
2811 // we trust OS to do conversion better than we can so try external
2812 // conversion methods first
2814 // the full order is:
2815 // 1. OS conversion (iconv() under Unix or Win32 API)
2816 // 2. hard coded conversions for UTF
2817 // 3. wxEncodingConverter as fall back
2823 #endif // !wxUSE_FONTMAP
2826 wxFontEncoding
encoding(m_encoding
);
2831 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2839 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2840 #endif // wxUSE_FONTMAP
2844 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2845 if ( it
!= gs_nameCache
.end() )
2847 if ( it
->second
.empty() )
2850 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
2857 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2858 // CS : in case this does not return valid names (eg for MacRoman)
2859 // encoding got a 'failure' entry in the cache all the same,
2860 // although it just has to be created using a different method, so
2861 // only store failed iconv creation attempts (or perhaps we
2862 // shouldn't do this at all ?)
2863 if ( names
[0] != NULL
)
2865 for ( ; *names
; ++names
)
2867 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2868 // will need changes that will obsolete this
2869 wxString
name(*names
);
2870 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
2873 gs_nameCache
[encoding
] = *names
;
2880 gs_nameCache
[encoding
] = _T(""); // cache the failure
2883 #endif // wxUSE_FONTMAP
2885 #endif // HAVE_ICONV
2887 #ifdef wxHAVE_WIN32_MB2WC
2890 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
2891 : new wxMBConv_win32(m_encoding
);
2900 #endif // wxHAVE_WIN32_MB2WC
2904 // leave UTF16 and UTF32 to the built-ins of wx
2905 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
2906 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
2909 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
2910 : new wxMBConv_cf(m_encoding
);
2912 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
2921 #endif // __DARWIN__
2924 wxFontEncoding enc
= m_encoding
;
2926 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
2928 // use "false" to suppress interactive dialogs -- we can be called from
2929 // anywhere and popping up a dialog from here is the last thing we want to
2931 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2933 #endif // wxUSE_FONTMAP
2937 case wxFONTENCODING_UTF7
:
2938 return new wxMBConvUTF7
;
2940 case wxFONTENCODING_UTF8
:
2941 return new wxMBConvUTF8
;
2943 case wxFONTENCODING_UTF16BE
:
2944 return new wxMBConvUTF16BE
;
2946 case wxFONTENCODING_UTF16LE
:
2947 return new wxMBConvUTF16LE
;
2949 case wxFONTENCODING_UTF32BE
:
2950 return new wxMBConvUTF32BE
;
2952 case wxFONTENCODING_UTF32LE
:
2953 return new wxMBConvUTF32LE
;
2956 // nothing to do but put here to suppress gcc warnings
2963 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
2964 : new wxMBConv_wxwin(m_encoding
);
2970 #endif // wxUSE_FONTMAP
2972 // NB: This is a hack to prevent deadlock. What could otherwise happen
2973 // in Unicode build: wxConvLocal creation ends up being here
2974 // because of some failure and logs the error. But wxLog will try to
2975 // attach a timestamp, for which it will need wxConvLocal (to convert
2976 // time to char* and then wchar_t*), but that fails, tries to log the
2977 // error, but wxLog has an (already locked) critical section that
2978 // guards the static buffer.
2979 static bool alreadyLoggingError
= false;
2980 if (!alreadyLoggingError
)
2982 alreadyLoggingError
= true;
2983 wxLogError(_("Cannot convert from the charset '%s'!"),
2987 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
2988 #else // !wxUSE_FONTMAP
2989 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
2990 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2993 alreadyLoggingError
= false;
// Create the real conversion object used by this wxCSConv.
//
// If neither a charset name nor an explicit encoding was given, m_encoding is
// first replaced with a concrete one (the system encoding from wxLocale or,
// as fallback, ISO8859-1). The object returned by DoCreate() is then stored
// in m_convReal and m_deferred is reset.
2999 void wxCSConv::CreateConvIfNeeded() const
// this method is const but has to modify the members below
3003 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3005 // if we have neither the name nor the encoding, use the default
3006 // encoding for this system
3007 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3010 self
->m_encoding
= wxLocale::GetSystemEncoding();
3012 // fallback to some reasonable default:
3013 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3014 #endif // wxUSE_INTL
// create the actual converter and remember that this has been done
3017 self
->m_convReal
= DoCreate();
3018 self
->m_deferred
= false;
// Return true if this object can be used for conversions.
//
// The real converter is created first if needed. ISO8859-1 is always
// reported as usable, for any other encoding the result is whether
// m_convReal is non-NULL.
3022 bool wxCSConv::IsOk() const
3024 CreateConvIfNeeded();
3026 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3027 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3028 return true; // always ok as we do it ourselves
3030 // m_convReal->IsOk() is called at its own creation, so we know it must
3031 // be ok if m_convReal is non-NULL
3032 return m_convReal
!= NULL
;
3035 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3036 const char *src
, size_t srcLen
) const
3038 CreateConvIfNeeded();
3041 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3044 return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
);
3047 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3048 const wchar_t *src
, size_t srcLen
) const
3050 CreateConvIfNeeded();
3053 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3056 return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
);
3059 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3061 CreateConvIfNeeded();
3064 return m_convReal
->MB2WC(buf
, psz
, n
);
3067 size_t len
= strlen(psz
);
3071 for (size_t c
= 0; c
<= len
; c
++)
3072 buf
[c
] = (unsigned char)(psz
[c
]);
// wxCSConv implementation of WC2MB().
//
// After creating the real converter on demand, the call is forwarded to
// m_convReal->WC2MB(buf, psz, n). The remaining code handles the direct case:
// it measures psz with wxWcslen() and walks it twice-over style, in one loop
// that stores each character into buf as a char and in a second loop that
// stores nothing. Either loop can bail out with wxCONV_FAILED.
//
// NOTE(review): the conditions guarding the forwarding return, selecting
// between the two loops and triggering wxCONV_FAILED are not visible in this
// excerpt; presumably the second loop is the "buf is NULL" length-only pass
// and the failure is for characters that do not fit in one byte -- confirm.
// The visible loops do not consult n.
3078 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3080 CreateConvIfNeeded();
3083 return m_convReal
->WC2MB(buf
, psz
, n
);
3086 const size_t len
= wxWcslen(psz
);
// Converting loop: "c <= len" means the terminating NUL is processed too.
3089 for (size_t c
= 0; c
<= len
; c
++)
3092 return wxCONV_FAILED
;
3094 buf
[c
] = (char)psz
[c
];
// Checking-only loop: same range, nothing is written.
3099 for (size_t c
= 0; c
<= len
; c
++)
3102 return wxCONV_FAILED
;
// wxCSConv implementation of GetMBNulLen().
//
// Creates the real converter on demand and forwards the query to
// m_convReal->GetMBNulLen().
//
// NOTE(review): the guard around the forwarding return and the value returned
// for the ISO-8859-1 case mentioned below are not visible in this excerpt.
3109 size_t wxCSConv::GetMBNulLen() const
3111 CreateConvIfNeeded();
3115 return m_convReal
->GetMBNulLen();
3118 // otherwise, we are ISO-8859-1
3122 #if wxUSE_UNICODE_UTF8
// wxCSConv implementation of IsUTF8(); only compiled when wxUSE_UNICODE_UTF8
// is set.
//
// Creates the real converter on demand and forwards the query to
// m_convReal->IsUTF8().
//
// NOTE(review): the guard around the forwarding return and the value returned
// for the ISO-8859-1 case mentioned below are not visible in this excerpt.
3123 bool wxCSConv::IsUTF8() const
3125 CreateConvIfNeeded();
3129 return m_convReal
->IsUTF8();
3132 // otherwise, we are ISO-8859-1
// Converts the narrow string s to a wide character buffer, trying several
// converters in turn: wxConvLibc first, then a default-constructed
// wxMBConvUTF8, and finally wxConvISO8859_1. An empty wxWCharBuffer can be
// returned before any conversion is attempted.
//
// NOTE(review): the conditions are not visible in this excerpt; presumably
// the early return is for a NULL s and each later converter is tried only if
// the previous one produced no result -- confirm.
3140 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3143 return wxWCharBuffer();
3145 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3147 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3149 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Converts the wide string ws to a narrow character buffer, trying
// wxConvLibc first and then a wxMBConvUTF8 created with the
// MAP_INVALID_UTF8_TO_OCTAL option. An empty wxCharBuffer can be returned
// before any conversion is attempted.
//
// NOTE(review): the conditions are not visible in this excerpt; presumably
// the early return is for a NULL ws and the UTF-8 converter is tried only if
// wxConvLibc produced no result -- confirm.
3154 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3157 return wxCharBuffer();
3159 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3161 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3166 #endif // wxUSE_UNICODE
3168 // ----------------------------------------------------------------------------
3170 // ----------------------------------------------------------------------------
3172 // NB: The reason why we create converter objects in this convoluted way,
3173 // using a factory function instead of global variable, is that they
3174 // may be used at static initialization time (some of them are used by
3175 // wxString ctors and there may be a global wxString object). In other
3176 // words, possibly _before_ the converter global object would be
3183 #undef wxConvISO8859_1
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) emits everything
// needed for one global converter called "name":
//  - name##Ptr: a klass* declared with WXDLLIMPEXP_DATA_BASE and initialised
//    to NULL;
//  - wxGet_##name##Ptr(): an accessor declared with WXDLLIMPEXP_BASE that
//    returns the address of a function-local static impl_klass object
//    (name##Obj) constructed with ctor_args;
//  - gs_##name##instance: a file-static klass* initialised by calling the
//    accessor, so the object gets created during static initialization.
//
// klass is the type exposed through the pointers, impl_klass the concrete
// type of the object actually constructed.
3185 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3186 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3187 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3189 static impl_klass name##Obj ctor_args; \
3190 return &name##Obj; \
3192 /* this ensures that all global converter objects are created */ \
3193 /* by the time static initialization is done, i.e. before any */ \
3194 /* thread is launched: */ \
3195 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for WX_DEFINE_GLOBAL_CONV2 for the common case where the object
// constructed is of the same class (klass) as the one exposed to callers.
3197 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3198 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// Definition of the wxConvLibc global converter, exposed as a wxMBConv.
// Two alternatives are listed, backed by wxMBConv_win32 and wxMBConvLibc
// respectively; both are constructed without arguments
// (wxEMPTY_PARAMETER_VALUE).
//
// NOTE(review): the preprocessor conditionals selecting one of the two
// definitions are not visible in this excerpt; presumably the wxMBConv_win32
// variant is the Windows one -- confirm.
3201 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3203 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3206 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3207 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3208 // provokes an error message about "not enough macro parameters"; and we
3209 // can't use "()" here as the name##Obj declaration would be parsed as a
3210 // function declaration then, so use a semicolon and live with an extra
3211 // empty statement (and hope that no compiler warns about this)
// Global converters defined through WX_DEFINE_GLOBAL_CONV:
//  - wxConvUTF8 (a wxMBConvStrictUTF8) and wxConvUTF7 (a wxMBConvUTF7) take
//    no constructor arguments, hence the lone ';' passed as ctor_args;
//  - wxConvLocal and wxConvISO8859_1 are wxCSConv objects constructed for
//    wxFONTENCODING_SYSTEM and wxFONTENCODING_ISO8859_1 respectively.
3212 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3213 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
3215 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3216 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// Converter pointers that can be re-pointed at run time: wxConvCurrent starts
// out referring to the wxConvLibc object and wxConvUI to the wxConvLocal one.
// Both are initialised through the wxGet_...Ptr() accessors rather than from
// the objects directly.
3218 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3219 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
// Converter used for file names (wxConvFileName). Outside of __DARWIN__ it
// is initialised to the wxConvLibc object via wxGet_wxConvLibcPtr().
// wxConvMacUTF8DObj is a file-static wxMBConv_cf created for
// wxFONTENCODING_UTF8.
//
// NOTE(review): the __DARWIN__ branch of the initializer is not visible in
// this excerpt; presumably it points wxConvFileName at wxConvMacUTF8DObj --
// confirm.
3222 // The xnu kernel always communicates file paths in decomposed UTF-8.
3223 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3224 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
3227 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3230 #else // !__DARWIN__
3231 wxGet_wxConvLibcPtr();
3232 #endif // __DARWIN__/!__DARWIN__
3234 #else // !wxUSE_WCHAR_T
3236 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3237 // stand-ins in absence of wchar_t
3238 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3243 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T