1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV _T("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
// Helper used when looking for the NUL terminator of a multibyte string.
//
// Returns true if at least one of the n bytes starting at p is not NUL and
// false if all of them are NUL (which is trivially the case for n == 0).
static bool NotAllNULs(const char *p, size_t n)
{
    // consume NUL bytes for as long as there are any left to check
    while ( n && *p++ == '\0' )
        n--;

    // a non-zero remainder means that the scan stopped at a non-NUL byte
    return n != 0;
}
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten
= 0;
176 // the number of NULs terminating this string
177 size_t nulLen
= 0; // not really needed, but just to avoid warnings
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
185 if ( srcLen
!= wxNO_LEN
)
187 // we need to know how to find the end of this string
188 nulLen
= GetMBNulLen();
189 if ( nulLen
== wxCONV_FAILED
)
190 return wxCONV_FAILED
;
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
197 char * const p
= bufTmp
.data();
198 memcpy(p
, src
, srcLen
);
199 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
205 srcEnd
= src
+ srcLen
;
207 else // quit after the first loop iteration
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
216 // all the complications come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd == NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
229 // try to convert the current chunk
230 size_t lenChunk
= MB2WC(NULL
, src
, 0);
231 if ( lenChunk
== wxCONV_FAILED
)
232 return wxCONV_FAILED
;
234 dstWritten
+= lenChunk
;
240 // nothing left in the input string, conversion succeeded
246 if ( dstWritten
> dstLen
)
247 return wxCONV_FAILED
;
249 // +1 is for trailing NUL
250 if ( MB2WC(dst
, src
, lenChunk
+ 1) == wxCONV_FAILED
)
251 return wxCONV_FAILED
;
260 // we convert just one chunk in this case as this is the entire
265 // advance the input pointer past the end of this chunk
266 while ( NotAllNULs(src
, nulLen
) )
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
275 src
+= nulLen
; // skipping over its terminator as well
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
279 // delimited by srcEnd
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
298 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
299 const wchar_t *src
, size_t srcLen
) const
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten
= 0;
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
309 // make a copy of the input string unless it is already properly
311 wxWCharBuffer bufTmp
;
312 if ( isNulTerminated
)
314 srcLen
= wxWcslen(src
) + 1;
316 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
318 // make a copy in order to properly NUL-terminate the string
319 bufTmp
= wxWCharBuffer(srcLen
);
320 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
324 const size_t lenNul
= GetMBNulLen();
325 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
327 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
329 // try to convert the current chunk
330 size_t lenChunk
= WC2MB(NULL
, src
, 0);
332 if ( lenChunk
== wxCONV_FAILED
)
333 return wxCONV_FAILED
;
335 dstWritten
+= lenChunk
;
336 if ( isNulTerminated
)
337 dstWritten
+= lenNul
;
341 if ( dstWritten
> dstLen
)
342 return wxCONV_FAILED
;
344 if ( WC2MB(dst
, src
, lenChunk
+ lenNul
) == wxCONV_FAILED
)
345 return wxCONV_FAILED
;
348 if ( isNulTerminated
)
356 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
358 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
359 if ( rc
!= wxCONV_FAILED
)
361 // ToWChar() returns the buffer length, i.e. including the trailing
362 // NUL, while this method doesn't take it into account
369 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
371 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
372 if ( rc
!= wxCONV_FAILED
)
380 wxMBConv::~wxMBConv()
382 // nothing to do here (necessary for Darwin linking probably)
385 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
389 // calculate the length of the buffer needed first
390 const size_t nLen
= ToWChar(NULL
, 0, psz
);
391 if ( nLen
!= wxCONV_FAILED
)
393 // now do the actual conversion
394 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
396 // +1 for the trailing NULL
397 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
402 return wxWCharBuffer();
405 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
409 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
410 if ( nLen
!= wxCONV_FAILED
)
412 wxCharBuffer
buf(nLen
- 1);
413 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
418 return wxCharBuffer();
422 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
424 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
425 if ( dstLen
!= wxCONV_FAILED
)
427 // notice that we allocate space for dstLen+1 wide characters here
428 // because we want the buffer to always be NUL-terminated, even if the
429 // input isn't (as otherwise the caller has no way to know its length)
430 wxWCharBuffer
wbuf(dstLen
);
431 wbuf
.data()[dstLen
] = L
'\0';
432 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
438 // we also need to handle NUL-terminated input strings
439 // specially: for them the output is the length of the string
440 // excluding the trailing NUL, however if we're asked to
441 // convert a specific number of characters we return the length
442 // of the resulting output even if it's NUL-terminated
443 if ( inLen
== wxNO_LEN
)
454 return wxWCharBuffer();
458 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
460 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
461 if ( dstLen
!= wxCONV_FAILED
)
463 const size_t nulLen
= GetMBNulLen();
465 // as above, ensure that the buffer is always NUL-terminated, even if
467 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
468 memset(buf
.data() + dstLen
, 0, nulLen
);
469 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
475 if ( inLen
== wxNO_LEN
)
477 // in this case both input and output are NUL-terminated
478 // and we're not supposed to count NUL
490 return wxCharBuffer();
493 // ----------------------------------------------------------------------------
495 // ----------------------------------------------------------------------------
497 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
499 return wxMB2WC(buf
, psz
, n
);
502 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
504 return wxWC2MB(buf
, psz
, n
);
507 // ----------------------------------------------------------------------------
508 // wxConvBrokenFileNames
509 // ----------------------------------------------------------------------------
513 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
515 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
516 wxStricmp(charset
, _T("UTF8")) == 0 )
517 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
519 m_conv
= new wxCSConv(charset
);
524 // ----------------------------------------------------------------------------
526 // ----------------------------------------------------------------------------
528 // Implementation (C) 2004 Fredrik Roubert
530 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
533 // BASE64 decoding table
535 static const unsigned char utf7unb64
[] =
537 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
538 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
539 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
540 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
541 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
542 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
543 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
544 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
545 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
546 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
547 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
548 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
549 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
550 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
551 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
552 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
553 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
554 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
555 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
556 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
557 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
558 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
559 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
560 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
561 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
562 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
563 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
564 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
565 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
566 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
567 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
568 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
571 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
572 const char *src
, size_t srcLen
) const
574 DecoderState stateOrig
,
576 if ( srcLen
== wxNO_LEN
)
578 // convert the entire string, up to and including the trailing NUL
579 srcLen
= strlen(src
) + 1;
581 // when working on the entire strings we don't update nor use the shift
582 // state from the previous call
583 statePtr
= &stateOrig
;
585 else // when working with partial strings we do use the shift state
587 statePtr
= wx_const_cast(DecoderState
*, &m_stateDecoder
);
589 // also save the old state to be able to rollback to it on error
590 stateOrig
= m_stateDecoder
;
593 // but to simplify the code below we use this variable in both cases
594 DecoderState
& state
= *statePtr
;
597 // number of characters [which would have been] written to dst [if it were
601 const char * const srcEnd
= src
+ srcLen
;
603 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
605 const unsigned char cc
= *src
++;
607 if ( state
.IsShifted() )
609 const unsigned char dc
= utf7unb64
[cc
];
612 // end of encoded part, check that nothing was left: the bit
613 // field cycles through 0,6,4,2 sequence so check that we're at
615 if ( state
.bit
!= 2 )
616 return wxCONV_FAILED
;
620 // re-parse this character normally below unless it's '-' which
621 // is consumed by the decoder
625 else // valid encoded character
627 // mini base64 decoder: each character is 6 bits
632 if ( state
.bit
>= 8 )
634 // got the full byte, consume it
636 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
640 // we've got the full word, output it
642 *dst
++ = (state
.msb
<< 8) | b
;
648 // just store it while we wait for LSB
656 if ( state
.IsDirect() )
658 // start of an encoded segment?
663 // just the encoded plus sign, don't switch to shifted mode
676 // only printable 7 bit ASCII characters (with the exception of
677 // NUL, TAB, CR and LF) can be used directly
678 if ( cc
>= 0x7f || (cc
< ' ' &&
679 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
680 return wxCONV_FAILED
;
691 // as we didn't read any characters we should be called with the same
692 // data (followed by some more new data) again later so don't save our
696 return wxCONV_FAILED
;
703 // BASE64 encoding table
705 static const unsigned char utf7enb64
[] =
707 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
708 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
709 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
710 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
711 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
712 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
713 'w', 'x', 'y', 'z', '0', '1', '2', '3',
714 '4', '5', '6', '7', '8', '9', '+', '/'
718 // UTF-7 encoding table
720 // 0 - Set D (directly encoded characters)
721 // 1 - Set O (optional direct characters)
722 // 2 - whitespace characters (optional)
723 // 3 - special characters
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if wc is marked with class 0 in the utf7encode table, i.e. is
// a 7 bit character which is written to UTF-7 output unchanged.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t may be a signed type, so do the range check on the unsigned
    // value: otherwise a negative wc would pass it and index before the
    // beginning of the table
    return static_cast<unsigned long>(wc) < 0x80 && utf7encode[wc] == 0;
}
742 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
743 const wchar_t *src
, size_t srcLen
) const
745 EncoderState stateOrig
,
747 if ( srcLen
== wxNO_LEN
)
749 // we don't apply the stored state when operating on entire strings at
751 statePtr
= &stateOrig
;
753 srcLen
= wxWcslen(src
) + 1;
755 else // do use the mode we left the output in previously
757 stateOrig
= m_stateEncoder
;
758 statePtr
= wx_const_cast(EncoderState
*, &m_stateEncoder
);
761 EncoderState
& state
= *statePtr
;
766 const wchar_t * const srcEnd
= src
+ srcLen
;
767 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
770 if ( wxIsUTF7Direct(cc
) )
772 if ( state
.IsShifted() )
774 // pad with zeros the last encoded block if necessary
778 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
793 else if ( cc
== '+' && state
.IsDirect() )
804 else if (((wxUint32
)cc
) > 0xffff)
806 // no surrogate pair generation (yet?)
807 return wxCONV_FAILED
;
812 if ( state
.IsDirect() )
821 // BASE64 encode string
824 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
827 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
829 for (state
.bit
+= 8; state
.bit
>= 6; )
833 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
838 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
846 // we need to restore the original encoder state if we were called just to
847 // calculate the amount of space needed as we will presumably be called
848 // again to really convert the data now
855 // ----------------------------------------------------------------------------
857 // ----------------------------------------------------------------------------
859 static const wxUint32 utf8_max
[]=
860 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
862 // boundaries of the private use area we use to (temporarily) remap invalid
863 // characters found in a UTF-8 encoded string
864 const wxUint32 wxUnicodePUA
= 0x100000;
865 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
867 // this table gives the length of the UTF-8 encoding from its first character:
// The table is indexed by the value of the lead byte; 0 marks bytes which
// can't start a sequence.
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F

    // these are invalid (continuation bytes can't start a sequence):
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF

    // C0 and C1 could only start overlong encodings of ASCII characters and
    // so are invalid too:
    0, 0,                                           // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                  // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // F5..FF
};
903 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
904 const char *src
, size_t srcLen
) const
906 wchar_t *out
= dstLen
? dst
: NULL
;
909 if ( srcLen
== wxNO_LEN
)
910 srcLen
= strlen(src
) + 1;
912 for ( const char *p
= src
; ; p
++ )
914 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
916 // all done successfully, just add the trailing NULL if we are not
917 // using explicit length
918 if ( srcLen
== wxNO_LEN
)
934 if ( out
&& !dstLen
-- )
938 unsigned char c
= *p
;
942 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
945 if ( srcLen
!= wxNO_LEN
)
952 unsigned len
= tableUtf8Lengths
[c
];
956 if ( srcLen
< len
) // the test works for wxNO_LEN too
959 if ( srcLen
!= wxNO_LEN
)
962 // Char. number range | UTF-8 octet sequence
963 // (hexadecimal) | (binary)
964 // ----------------------+----------------------------------------
965 // 0000 0000 - 0000 007F | 0xxxxxxx
966 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
967 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
968 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
970 // Code point value is stored in bits marked with 'x',
971 // lowest-order bit of the value on the right side in the diagram
972 // above. (from RFC 3629)
974 // mask to extract lead byte's value ('x' bits above), by sequence
976 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
978 // mask and value of lead byte's most significant bits, by length:
979 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
980 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
982 len
--; // it's more convenient to work with 0-based length here
984 // extract the lead byte's value bits:
985 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
988 code
= c
& leadValueMask
[len
];
990 // all remaining bytes, if any, are handled in the same way
991 // regardless of sequence's length:
995 if ( (c
& 0xC0) != 0x80 )
996 return wxCONV_FAILED
;
1004 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1005 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
1014 #endif // WC_UTF16/!WC_UTF16
1022 return wxCONV_FAILED
;
1026 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
1027 const wchar_t *src
, size_t srcLen
) const
1029 char *out
= dstLen
? dst
: NULL
;
1032 for ( const wchar_t *wp
= src
; ; wp
++ )
1034 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
) )
1036 // all done successfully, just add the trailing NULL if we are not
1037 // using explicit length
1038 if ( srcLen
== wxNO_LEN
)
1054 if ( srcLen
!= wxNO_LEN
)
1059 // cast is ok for WC_UTF16
1060 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1062 // skip the next char too as we decoded a surrogate
1065 #else // wchar_t is UTF-32
1066 code
= *wp
& 0x7fffffff;
1078 out
[0] = (char)code
;
1081 else if ( code
<= 0x07FF )
1089 // NB: this line takes 6 least significant bits, encodes them as
1090 // 10xxxxxx and discards them so that the next byte can be encoded:
1091 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1092 out
[0] = 0xC0 | code
;
1095 else if ( code
< 0xFFFF )
1103 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1104 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1105 out
[0] = 0xE0 | code
;
1108 else if ( code
<= 0x10FFFF )
1116 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1117 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1118 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1119 out
[0] = 0xF0 | code
;
1124 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1137 // we only get here if an error occurs during decoding
1138 return wxCONV_FAILED
;
1141 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1142 const char *psz
, size_t srcLen
) const
1144 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1145 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1149 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1151 const char *opsz
= psz
;
1152 bool invalid
= false;
1153 unsigned char cc
= *psz
++, fc
= cc
;
1155 for (cnt
= 0; fc
& 0x80; cnt
++)
1165 // escape the escape character for octal escapes
1166 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1167 && cc
== '\\' && (!buf
|| len
< n
))
1179 // invalid UTF-8 sequence
1184 unsigned ocnt
= cnt
- 1;
1185 wxUint32 res
= cc
& (0x3f >> cnt
);
1189 if ((cc
& 0xC0) != 0x80)
1191 // invalid UTF-8 sequence
1197 res
= (res
<< 6) | (cc
& 0x3f);
1200 if (invalid
|| res
<= utf8_max
[ocnt
])
1202 // illegal UTF-8 encoding
1205 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1206 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1208 // if one of our PUA characters turns up externally
1209 // it must also be treated as an illegal sequence
1210 // (a bit like you have to escape an escape character)
1216 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1217 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1218 if (pa
== wxCONV_FAILED
)
1230 *buf
++ = (wchar_t)res
;
1232 #endif // WC_UTF16/!WC_UTF16
1238 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1240 while (opsz
< psz
&& (!buf
|| len
< n
))
1243 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1244 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1245 wxASSERT(pa
!= wxCONV_FAILED
);
1252 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1258 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1260 while (opsz
< psz
&& (!buf
|| len
< n
))
1262 if ( buf
&& len
+ 3 < n
)
1264 unsigned char on
= *opsz
;
1266 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1267 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1268 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1275 else // MAP_INVALID_UTF8_NOT
1277 return wxCONV_FAILED
;
1283 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
// Returns true if wch is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    return L'0' <= wch && wch <= L'7';
}
1294 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1295 const wchar_t *psz
, size_t srcLen
) const
1297 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1298 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1302 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1307 // cast is ok for WC_UTF16
1308 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1309 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1311 cc
= (*psz
++) & 0x7fffffff;
1314 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1315 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1318 *buf
++ = (char)(cc
- wxUnicodePUA
);
1321 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1322 && cc
== L
'\\' && psz
[0] == L
'\\' )
1329 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1331 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1335 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1336 (psz
[1] - L
'0') * 010 +
1346 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1362 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1364 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1370 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1376 // ============================================================================
1378 // ============================================================================
1380 #ifdef WORDS_BIGENDIAN
1381 #define wxMBConvUTF16straight wxMBConvUTF16BE
1382 #define wxMBConvUTF16swap wxMBConvUTF16LE
1384 #define wxMBConvUTF16swap wxMBConvUTF16BE
1385 #define wxMBConvUTF16straight wxMBConvUTF16LE
1389 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1391 if ( srcLen
== wxNO_LEN
)
1393 // count the number of bytes in input, including the trailing NULs
1394 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1395 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1398 srcLen
*= BYTES_PER_CHAR
;
1400 else // we already have the length
1402 // we can only convert an entire number of UTF-16 characters
1403 if ( srcLen
% BYTES_PER_CHAR
)
1404 return wxCONV_FAILED
;
1410 // case when in-memory representation is UTF-16 too
1413 // ----------------------------------------------------------------------------
1414 // conversions without endianness change
1415 // ----------------------------------------------------------------------------
1418 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1419 const char *src
, size_t srcLen
) const
1421 // set up the scene for using memcpy() (which is presumably more efficient
1422 // than copying the bytes one by one)
1423 srcLen
= GetLength(src
, srcLen
);
1424 if ( srcLen
== wxNO_LEN
)
1425 return wxCONV_FAILED
;
1427 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1430 if ( dstLen
< inLen
)
1431 return wxCONV_FAILED
;
1433 memcpy(dst
, src
, srcLen
);
1440 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1441 const wchar_t *src
, size_t srcLen
) const
1443 if ( srcLen
== wxNO_LEN
)
1444 srcLen
= wxWcslen(src
) + 1;
1446 srcLen
*= BYTES_PER_CHAR
;
1450 if ( dstLen
< srcLen
)
1451 return wxCONV_FAILED
;
1453 memcpy(dst
, src
, srcLen
);
1459 // ----------------------------------------------------------------------------
1460 // endian-reversing conversions
1461 // ----------------------------------------------------------------------------
1464 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1465 const char *src
, size_t srcLen
) const
1467 srcLen
= GetLength(src
, srcLen
);
1468 if ( srcLen
== wxNO_LEN
)
1469 return wxCONV_FAILED
;
1471 srcLen
/= BYTES_PER_CHAR
;
1475 if ( dstLen
< srcLen
)
1476 return wxCONV_FAILED
;
1478 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1479 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1481 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1489 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1490 const wchar_t *src
, size_t srcLen
) const
1492 if ( srcLen
== wxNO_LEN
)
1493 srcLen
= wxWcslen(src
) + 1;
1495 srcLen
*= BYTES_PER_CHAR
;
1499 if ( dstLen
< srcLen
)
1500 return wxCONV_FAILED
;
1502 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1503 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1505 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1512 #else // !WC_UTF16: wchar_t is UTF-32
1514 // ----------------------------------------------------------------------------
1515 // conversions without endianness change
1516 // ----------------------------------------------------------------------------
1519 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1520 const char *src
, size_t srcLen
) const
1522 srcLen
= GetLength(src
, srcLen
);
1523 if ( srcLen
== wxNO_LEN
)
1524 return wxCONV_FAILED
;
1526 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1529 // optimization: return maximal space which could be needed for this
1530 // string even if the real size could be smaller if the buffer contains
1536 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1537 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1539 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1541 return wxCONV_FAILED
;
1543 if ( ++outLen
> dstLen
)
1544 return wxCONV_FAILED
;
1554 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1555 const wchar_t *src
, size_t srcLen
) const
1557 if ( srcLen
== wxNO_LEN
)
1558 srcLen
= wxWcslen(src
) + 1;
1561 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1562 for ( size_t n
= 0; n
< srcLen
; n
++ )
1565 const size_t numChars
= encode_utf16(*src
++, cc
);
1566 if ( numChars
== wxCONV_FAILED
)
1567 return wxCONV_FAILED
;
1569 outLen
+= numChars
* BYTES_PER_CHAR
;
1572 if ( outLen
> dstLen
)
1573 return wxCONV_FAILED
;
1576 if ( numChars
== 2 )
1578 // second character of a surrogate
1587 // ----------------------------------------------------------------------------
1588 // endian-reversing conversions
1589 // ----------------------------------------------------------------------------
1592 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1593 const char *src
, size_t srcLen
) const
1595 srcLen
= GetLength(src
, srcLen
);
1596 if ( srcLen
== wxNO_LEN
)
1597 return wxCONV_FAILED
;
1599 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1602 // optimization: return maximal space which could be needed for this
1603 // string even if the real size could be smaller if the buffer contains
1609 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1610 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1615 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1617 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1619 const size_t numChars
= decode_utf16(tmp
, ch
);
1620 if ( numChars
== wxCONV_FAILED
)
1621 return wxCONV_FAILED
;
1623 if ( numChars
== 2 )
1626 if ( ++outLen
> dstLen
)
1627 return wxCONV_FAILED
;
1637 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1638 const wchar_t *src
, size_t srcLen
) const
1640 if ( srcLen
== wxNO_LEN
)
1641 srcLen
= wxWcslen(src
) + 1;
1644 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1645 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1648 const size_t numChars
= encode_utf16(*src
, cc
);
1649 if ( numChars
== wxCONV_FAILED
)
1650 return wxCONV_FAILED
;
1652 outLen
+= numChars
* BYTES_PER_CHAR
;
1655 if ( outLen
> dstLen
)
1656 return wxCONV_FAILED
;
1658 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1659 if ( numChars
== 2 )
1661 // second character of a surrogate
1662 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1670 #endif // WC_UTF16/!WC_UTF16
1673 // ============================================================================
1675 // ============================================================================
1677 #ifdef WORDS_BIGENDIAN
1678 #define wxMBConvUTF32straight wxMBConvUTF32BE
1679 #define wxMBConvUTF32swap wxMBConvUTF32LE
1681 #define wxMBConvUTF32swap wxMBConvUTF32BE
1682 #define wxMBConvUTF32straight wxMBConvUTF32LE
1686 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1687 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
// Determines the length in bytes of the UTF-32 input src.
//
// If srcLen is wxNO_LEN the input is scanned as 32-bit units up to the first
// zero unit and the terminator is counted as part of the string. Otherwise
// the given length is only checked for being a whole number of UTF-32
// characters; wxCONV_FAILED is returned if it is not.
// NOTE(review): the final return statement is not visible in this view. The
// ToWChar() callers in this file divide the result by BYTES_PER_CHAR and test
// it against wxNO_LEN, which presumably relies on wxNO_LEN and wxCONV_FAILED
// having the same value -- confirm.
1690 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1692 if ( srcLen
== wxNO_LEN
)
1694 // count the number of bytes in input, including the trailing NULs
1695 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
// srcLen starts at 1 so that the terminating zero unit is counted too
1696 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
// convert the number of 32-bit units to a number of bytes
1699 srcLen
*= BYTES_PER_CHAR
;
1701 else // we already have the length
1703 // we can only convert an entire number of UTF-32 characters
1704 if ( srcLen
% BYTES_PER_CHAR
)
1705 return wxCONV_FAILED
;
1711 // case when in-memory representation is UTF-16
1714 // ----------------------------------------------------------------------------
1715 // conversions without endianness change
1716 // ----------------------------------------------------------------------------
1719 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1720 const char *src
, size_t srcLen
) const
1722 srcLen
= GetLength(src
, srcLen
);
1723 if ( srcLen
== wxNO_LEN
)
1724 return wxCONV_FAILED
;
1726 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1727 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1729 for ( size_t n
= 0; n
< inLen
; n
++ )
1732 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1733 if ( numChars
== wxCONV_FAILED
)
1734 return wxCONV_FAILED
;
1739 if ( outLen
> dstLen
)
1740 return wxCONV_FAILED
;
1743 if ( numChars
== 2 )
1745 // second character of a surrogate
1755 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1756 const wchar_t *src
, size_t srcLen
) const
1758 if ( srcLen
== wxNO_LEN
)
1759 srcLen
= wxWcslen(src
) + 1;
1763 // optimization: return maximal space which could be needed for this
1764 // string instead of the exact amount which could be less if there are
1765 // any surrogates in the input
1767 // we consider that surrogates are rare enough to make it worthwhile to
1768 // avoid running the loop below at the cost of slightly extra memory
1770 return srcLen
* BYTES_PER_CHAR
;
1773 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1775 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1777 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1779 return wxCONV_FAILED
;
1781 outLen
+= BYTES_PER_CHAR
;
1783 if ( outLen
> dstLen
)
1784 return wxCONV_FAILED
;
1792 // ----------------------------------------------------------------------------
1793 // endian-reversing conversions
1794 // ----------------------------------------------------------------------------
1797 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1798 const char *src
, size_t srcLen
) const
1800 srcLen
= GetLength(src
, srcLen
);
1801 if ( srcLen
== wxNO_LEN
)
1802 return wxCONV_FAILED
;
1804 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1805 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1807 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1810 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1811 if ( numChars
== wxCONV_FAILED
)
1812 return wxCONV_FAILED
;
1817 if ( outLen
> dstLen
)
1818 return wxCONV_FAILED
;
1821 if ( numChars
== 2 )
1823 // second character of a surrogate
1833 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1834 const wchar_t *src
, size_t srcLen
) const
1836 if ( srcLen
== wxNO_LEN
)
1837 srcLen
= wxWcslen(src
) + 1;
1841 // optimization: return maximal space which could be needed for this
1842 // string instead of the exact amount which could be less if there are
1843 // any surrogates in the input
1845 // we consider that surrogates are rare enough to make it worthwhile to
1846 // avoid running the loop below at the cost of slightly extra memory
1848 return srcLen
*BYTES_PER_CHAR
;
1851 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1853 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1855 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1857 return wxCONV_FAILED
;
1859 outLen
+= BYTES_PER_CHAR
;
1861 if ( outLen
> dstLen
)
1862 return wxCONV_FAILED
;
1864 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1870 #else // !WC_UTF16: wchar_t is UTF-32
1872 // ----------------------------------------------------------------------------
1873 // conversions without endianness change
1874 // ----------------------------------------------------------------------------
// Converts UTF-32 input in native byte order to wchar_t. This is the variant
// compiled when wchar_t is itself UTF-32, so the conversion is a plain copy.
//
// dst/dstLen: output buffer and its size in wide characters.
// src/srcLen: input bytes and their count, or wxNO_LEN for input terminated
//             by a zero 32-bit unit.
// Returns wxCONV_FAILED if the input length is invalid or dst is too small.
// NOTE(review): the guard for a NULL dst and the final return statement are
// not visible in this view -- confirm against the full function.
1877 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1878 const char *src
, size_t srcLen
) const
1880 // use memcpy() as it should be much faster than hand-written loop
// after this call srcLen is the input size in bytes
1881 srcLen
= GetLength(src
, srcLen
);
1882 if ( srcLen
== wxNO_LEN
)
1883 return wxCONV_FAILED
;
// number of characters in the input: this is what dstLen is compared with
1885 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1888 if ( dstLen
< inLen
)
1889 return wxCONV_FAILED
;
// memcpy() takes a byte count, hence srcLen and not inLen
1891 memcpy(dst
, src
, srcLen
);
// Converts wchar_t to UTF-32 in native byte order. This is the variant
// compiled when wchar_t is itself UTF-32, so the conversion is a plain copy.
//
// dst/dstLen: output buffer and its size in bytes.
// src/srcLen: input wide characters and their count, or wxNO_LEN if src is
//             NUL-terminated.
// Returns wxCONV_FAILED if the output buffer is too small.
// NOTE(review): the guard for a NULL dst and the final return statement are
// not visible in this view -- confirm against the full function.
1898 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1899 const wchar_t *src
, size_t srcLen
) const
// no explicit length: measure the string and include its terminating NUL
1901 if ( srcLen
== wxNO_LEN
)
1902 srcLen
= wxWcslen(src
) + 1;
// from here on srcLen is a byte count, not a character count
1904 srcLen
*= BYTES_PER_CHAR
;
1908 if ( dstLen
< srcLen
)
1909 return wxCONV_FAILED
;
1911 memcpy(dst
, src
, srcLen
);
1917 // ----------------------------------------------------------------------------
1918 // endian-reversing conversions
1919 // ----------------------------------------------------------------------------
// Converts UTF-32 input of the opposite byte order to wchar_t. This is the
// variant compiled when wchar_t is itself UTF-32, so every 32-bit unit is
// only byte-swapped.
//
// dst/dstLen: output buffer and its size in wide characters.
// src/srcLen: input bytes and their count, or wxNO_LEN for input terminated
//             by a zero 32-bit unit.
// Returns wxCONV_FAILED if the input length is invalid or dst is too small.
// NOTE(review): the guard for a NULL dst and the final return statement are
// not visible in this view -- confirm against the full function.
1922 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1923 const char *src
, size_t srcLen
) const
// after this call srcLen is the input size in bytes
1925 srcLen
= GetLength(src
, srcLen
);
1926 if ( srcLen
== wxNO_LEN
)
1927 return wxCONV_FAILED
;
// from here on srcLen is a count of UTF-32 characters, not of bytes
1929 srcLen
/= BYTES_PER_CHAR
;
1933 if ( dstLen
< srcLen
)
1934 return wxCONV_FAILED
;
1936 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
// one output wide character per input unit
1937 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1939 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
// Converts wchar_t to UTF-32 of the opposite byte order. This is the variant
// compiled when wchar_t is itself UTF-32, so every wide character is only
// byte-swapped.
//
// dst/dstLen: output buffer and its size in bytes.
// src/srcLen: input wide characters and their count, or wxNO_LEN if src is
//             NUL-terminated.
// Returns wxCONV_FAILED if the output buffer is too small.
// NOTE(review): the guard for a NULL dst and the final return statement are
// not visible in this view -- confirm against the full function.
1947 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1948 const wchar_t *src
, size_t srcLen
) const
// no explicit length: measure the string and include its terminating NUL
1950 if ( srcLen
== wxNO_LEN
)
1951 srcLen
= wxWcslen(src
) + 1;
// from here on srcLen is a byte count, not a character count
1953 srcLen
*= BYTES_PER_CHAR
;
1957 if ( dstLen
< srcLen
)
1958 return wxCONV_FAILED
;
1960 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
// n counts bytes while src advances by one wide character per iteration
1961 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1963 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1970 #endif // WC_UTF16/!WC_UTF16
1973 // ============================================================================
1974 // The classes doing conversion using the iconv_xxx() functions
1975 // ============================================================================
1979 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1980 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1981 // (unless there's yet another bug in glibc) the only case when iconv()
1982 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1983 // left in the input buffer -- when _real_ error occurs,
1984 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1986 // [This bug does not appear in glibc 2.2.]
1987 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1988 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1989 (errno != E2BIG || bufLeft != 0))
1991 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1994 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1996 #define ICONV_T_INVALID ((iconv_t)-1)
1998 #if SIZEOF_WCHAR_T == 4
1999 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2000 #define WC_ENC wxFONTENCODING_UTF32
2001 #elif SIZEOF_WCHAR_T == 2
2002 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2003 #define WC_ENC wxFONTENCODING_UTF16
2004 #else // sizeof(wchar_t) != 2 nor 4
2005 // does this ever happen?
2006 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2009 // ----------------------------------------------------------------------------
2010 // wxMBConv_iconv: encapsulates an iconv character set
2011 // ----------------------------------------------------------------------------
2013 class wxMBConv_iconv
: public wxMBConv
2016 wxMBConv_iconv(const char *name
);
2017 virtual ~wxMBConv_iconv();
2019 // implement base class virtual methods
2020 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
2021 const char *src
, size_t srcLen
= wxNO_LEN
) const;
2022 virtual size_t FromWChar(char *dst
, size_t dstLen
,
2023 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
2024 virtual size_t GetMBNulLen() const;
2026 #if wxUSE_UNICODE_UTF8
2027 virtual bool IsUTF8() const;
2030 virtual wxMBConv
*Clone() const
2032 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
2033 p
->m_minMBCharWidth
= m_minMBCharWidth
;
2038 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
2041 // the iconv handlers used to translate from multibyte
2042 // to wide char and in the other direction
2047 // guards access to m2w and w2m objects
2048 wxMutex m_iconvMutex
;
2052 // the name (for iconv_open()) of a wide char charset -- if none is
2053 // available on this machine, it will remain empty
2054 static wxString ms_wcCharsetName
;
2056 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2057 // different endian-ness than the native one
2058 static bool ms_wcNeedsSwap
;
2061 // name of the encoding handled by this conversion
2064 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2066 size_t m_minMBCharWidth
;
2069 // make the constructor available for unit testing
2070 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2072 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2073 if ( !result
->IsOk() )
2082 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2083 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
2085 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
2088 m_minMBCharWidth
= 0;
2090 // check for charset that represents wchar_t:
2091 if ( ms_wcCharsetName
.empty() )
2093 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
2096 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2097 #else // !wxUSE_FONTMAP
// Fallback list of candidate charset names for wchar_t, used when
// wxUSE_FONTMAP is disabled; the entry compiled in depends on the size of
// wchar_t.
2098 static const wxChar
*names_static
[] =
2100 #if SIZEOF_WCHAR_T == 4
// the test below must use "==": "=" is not a valid operator in a
// preprocessor expression and broke the build whenever this branch was
// evaluated, i.e. with 2-byte wchar_t
2102 #elif SIZEOF_WCHAR_T == 2
2107 const wxChar
**names
= names_static
;
2108 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2110 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2112 const wxString
nameCS(*names
);
2114 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2115 wxString
nameXE(nameCS
);
2117 #ifdef WORDS_BIGENDIAN
2119 #else // little endian
2123 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2126 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2127 if ( m2w
== ICONV_T_INVALID
)
2129 // try charset w/o bytesex info (e.g. "UCS4")
2130 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2132 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2134 // and check for bytesex ourselves:
2135 if ( m2w
!= ICONV_T_INVALID
)
2137 char buf
[2], *bufPtr
;
2146 outsz
= SIZEOF_WCHAR_T
* 2;
2147 char* wbufPtr
= (char*)wbuf
;
2151 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2154 if (ICONV_FAILED(res
, insz
))
2156 wxLogLastError(wxT("iconv"));
2157 wxLogError(_("Conversion to charset '%s' doesn't work."),
2160 else // ok, can convert to this encoding, remember it
2162 ms_wcCharsetName
= nameCS
;
2163 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2167 else // use charset not requiring byte swapping
2169 ms_wcCharsetName
= nameXE
;
2173 wxLogTrace(TRACE_STRCONV
,
2174 wxT("iconv wchar_t charset is \"%s\"%s"),
2175 ms_wcCharsetName
.empty() ? wxString("<none>")
2177 ms_wcNeedsSwap
? _T(" (needs swap)")
2180 else // we already have ms_wcCharsetName
2182 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2185 if ( ms_wcCharsetName
.empty() )
2187 w2m
= ICONV_T_INVALID
;
2191 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2192 if ( w2m
== ICONV_T_INVALID
)
2194 wxLogTrace(TRACE_STRCONV
,
2195 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2196 ms_wcCharsetName
.c_str(), name
);
2201 wxMBConv_iconv::~wxMBConv_iconv()
2203 if ( m2w
!= ICONV_T_INVALID
)
2205 if ( w2m
!= ICONV_T_INVALID
)
2210 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2211 const char *src
, size_t srcLen
) const
2213 if ( srcLen
== wxNO_LEN
)
2215 // find the string length: notice that this must be done differently for
2216 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2218 const size_t nulLen
= GetMBNulLen();
2222 return wxCONV_FAILED
;
2225 srcLen
= strlen(src
); // arguably more optimized than our version
2230 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2231 // but they also have to start at character boundary and not
2232 // span two adjacent characters
2234 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2240 // when we're determining the length of the string ourselves we count
2241 // the terminating NUL(s) as part of it and always NUL-terminate the
2246 // we express length in the number of (wide) characters but iconv always
2247 // counts buffer sizes in bytes
2248 dstLen
*= SIZEOF_WCHAR_T
;
2251 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2252 // Unfortunately there are a couple of global wxCSConv objects such as
2253 // wxConvLocal that are used all over wx code, so we have to make sure
2254 // the handle is used by at most one thread at a time. Otherwise
2255 // only a few wx classes would be safe to use from non-main threads
2256 // as MB<->WC conversion would fail "randomly".
2257 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2258 #endif // wxUSE_THREADS
2261 const char *pszPtr
= src
;
2265 char* bufPtr
= (char*)dst
;
2267 // have destination buffer, convert there
2268 size_t dstLenOrig
= dstLen
;
2270 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2273 // convert the number of bytes converted as returned by iconv to the
2274 // number of (wide) characters converted that we need
2275 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2279 // convert to native endianness
2280 for ( unsigned i
= 0; i
< res
; i
++ )
2281 dst
[i
] = WC_BSWAP(dst
[i
]);
2284 else // no destination buffer
2286 // convert using temp buffer to calculate the size of the buffer needed
2292 char* bufPtr
= (char*)tbuf
;
2293 dstLen
= 8 * SIZEOF_WCHAR_T
;
2296 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2299 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
2301 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2304 if (ICONV_FAILED(cres
, srcLen
))
2306 //VS: it is ok if iconv fails, hence trace only
2307 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2308 return wxCONV_FAILED
;
2314 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2315 const wchar_t *src
, size_t srcLen
) const
2318 // NB: explained in ToWChar()
2319 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2322 if ( srcLen
== wxNO_LEN
)
2323 srcLen
= wxWcslen(src
) + 1;
2325 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2326 size_t outbuflen
= dstLen
;
2329 wchar_t *tmpbuf
= 0;
2333 // need to copy to temp buffer to switch endianness
2334 // (doing WC_BSWAP twice on the original buffer won't help, as it
2335 // could be in read-only memory, or be accessed in some other thread)
2336 tmpbuf
= (wchar_t *)malloc(inbuflen
+ SIZEOF_WCHAR_T
);
2337 for ( size_t i
= 0; i
< srcLen
; i
++ )
2338 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2340 tmpbuf
[srcLen
] = L
'\0';
2344 char* inbuf
= (char*)src
;
2347 // have destination buffer, convert there
2348 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2350 res
= dstLen
- outbuflen
;
2352 else // no destination buffer
2354 // convert using temp buffer to calculate the size of the buffer needed
2362 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2364 res
+= 16 - outbuflen
;
2366 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2374 if (ICONV_FAILED(cres
, inbuflen
))
2376 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2377 return wxCONV_FAILED
;
2383 size_t wxMBConv_iconv::GetMBNulLen() const
2385 if ( m_minMBCharWidth
== 0 )
2387 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2390 // NB: explained in ToWChar()
2391 wxMutexLocker
lock(self
->m_iconvMutex
);
2394 const wchar_t *wnul
= L
"";
2395 char buf
[8]; // should be enough for NUL in any encoding
2396 size_t inLen
= sizeof(wchar_t),
2397 outLen
= WXSIZEOF(buf
);
2398 char *inBuff
= (char *)wnul
;
2399 char *outBuff
= buf
;
2400 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2402 self
->m_minMBCharWidth
= (size_t)-1;
2406 self
->m_minMBCharWidth
= outBuff
- buf
;
2410 return m_minMBCharWidth
;
2413 #if wxUSE_UNICODE_UTF8
// Tells whether this converter is for UTF-8 by comparing its charset name
// (m_name) with the two spellings "UTF-8" and "UTF8" using wxStricmp()
// (presumably a case-insensitive comparison -- confirm).
2414 bool wxMBConv_iconv::IsUTF8() const
2416 return wxStricmp(m_name
, "UTF-8") == 0 ||
2417 wxStricmp(m_name
, "UTF8") == 0;
2421 #endif // HAVE_ICONV
2424 // ============================================================================
2425 // Win32 conversion classes
2426 // ============================================================================
2428 #ifdef wxHAVE_WIN32_MB2WC
2432 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2433 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2436 class wxMBConv_win32
: public wxMBConv
2441 m_CodePage
= CP_ACP
;
2442 m_minMBCharWidth
= 0;
2445 wxMBConv_win32(const wxMBConv_win32
& conv
)
2448 m_CodePage
= conv
.m_CodePage
;
2449 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2453 wxMBConv_win32(const char* name
)
2455 m_CodePage
= wxCharsetToCodepage(name
);
2456 m_minMBCharWidth
= 0;
2459 wxMBConv_win32(wxFontEncoding encoding
)
2461 m_CodePage
= wxEncodingToCodepage(encoding
);
2462 m_minMBCharWidth
= 0;
2464 #endif // wxUSE_FONTMAP
2466 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2468 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2469 // the behaviour is not compatible with the Unix version (using iconv)
2470 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2471 // wouldn't work if reading an incomplete MB char didn't result in an
2474 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2475 // Win XP or newer and it is not supported for UTF-[78] so we always
2476 // use our own conversions in this case. See
2477 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2478 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2479 if ( m_CodePage
== CP_UTF8
)
2481 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2484 if ( m_CodePage
== CP_UTF7
)
2486 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2490 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2491 IsAtLeastWin2kSP4() )
2493 flags
= MB_ERR_INVALID_CHARS
;
2496 const size_t len
= ::MultiByteToWideChar
2498 m_CodePage
, // code page
2499 flags
, // flags: fall on error
2500 psz
, // input string
2501 -1, // its length (NUL-terminated)
2502 buf
, // output string
2503 buf
? n
: 0 // size of output buffer
2507 // function totally failed
2508 return wxCONV_FAILED
;
2511 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2512 // check if we succeeded, by doing a double trip:
2513 if ( !flags
&& buf
)
2515 const size_t mbLen
= strlen(psz
);
2516 wxCharBuffer
mbBuf(mbLen
);
2517 if ( ::WideCharToMultiByte
2524 mbLen
+ 1, // size in bytes, not length
2528 strcmp(mbBuf
, psz
) != 0 )
2530 // we didn't obtain the same thing we started from, hence
2531 // the conversion was lossy and we consider that it failed
2532 return wxCONV_FAILED
;
2536 // note that it returns count of written chars for buf != NULL and size
2537 // of the needed buffer for buf == NULL so in either case the length of
2538 // the string (which never includes the terminating NUL) is one less
2542 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2545 we have a problem here: by default, WideCharToMultiByte() may
2546 replace characters unrepresentable in the target code page with bad
2547 quality approximations such as turning "1/2" symbol (U+00BD) into
2548 "1" for the code pages which don't have it and we, obviously, want
2549 to avoid this at any price
2551 the trouble is that this function does it _silently_, i.e. it won't
2552 even tell us whether it did or not... Win98/2000 and higher provide
2553 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2554 we have to resort to a round trip, i.e. check that converting back
2555 results in the same string -- this is, of course, expensive but
2556 otherwise we simply can't be sure to not garble the data.
2559 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2560 // it doesn't work with CJK encodings (which we test for rather roughly
2561 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2563 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2566 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2568 // it's our lucky day
2569 flags
= WC_NO_BEST_FIT_CHARS
;
2570 pUsedDef
= &usedDef
;
2572 else // old system or unsupported encoding
2578 const size_t len
= ::WideCharToMultiByte
2580 m_CodePage
, // code page
2581 flags
, // either none or no best fit
2582 pwz
, // input string
2583 -1, // it is (wide) NUL-terminated
2584 buf
, // output buffer
2585 buf
? n
: 0, // and its size
2586 NULL
, // default "replacement" char
2587 pUsedDef
// [out] was it used?
2592 // function totally failed
2593 return wxCONV_FAILED
;
2596 // we did something, check if we really succeeded
2599 // check if the conversion failed, i.e. if any replacements
2602 return wxCONV_FAILED
;
2604 else // we must resort to double tripping...
2606 // first we need to ensure that we really have the MB data: this is
2607 // not the case if we're called with NULL buffer, in which case we
2608 // need to do the conversion yet again
2609 wxCharBuffer bufDef
;
2612 bufDef
= wxCharBuffer(len
);
2613 buf
= bufDef
.data();
2614 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2615 buf
, len
, NULL
, NULL
) )
2616 return wxCONV_FAILED
;
2621 wxWCharBuffer
wcBuf(n
);
2622 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2623 wcscmp(wcBuf
, pwz
) != 0 )
2625 // we didn't obtain the same thing we started from, hence
2626 // the conversion was lossy and we consider that it failed
2627 return wxCONV_FAILED
;
2631 // see the comment above for the reason of "len - 1"
2635 virtual size_t GetMBNulLen() const
2637 if ( m_minMBCharWidth
== 0 )
2639 int len
= ::WideCharToMultiByte
2641 m_CodePage
, // code page
2643 L
"", // input string
2644 1, // translate just the NUL
2645 NULL
, // output buffer
2647 NULL
, // no replacement char
2648 NULL
// [out] don't care if it was used
2651 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2655 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2656 self
->m_minMBCharWidth
= (size_t)-1;
2660 self
->m_minMBCharWidth
= (size_t)-1;
2666 self
->m_minMBCharWidth
= len
;
2671 return m_minMBCharWidth
;
2674 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2676 bool IsOk() const { return m_CodePage
!= -1; }
2679 static bool CanUseNoBestFit()
2681 static int s_isWin98Or2k
= -1;
2683 if ( s_isWin98Or2k
== -1 )
2686 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2688 case wxOS_WINDOWS_9X
:
2689 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2692 case wxOS_WINDOWS_NT
:
2693 s_isWin98Or2k
= verMaj
>= 5;
2697 // unknown: be conservative by default
2702 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2705 return s_isWin98Or2k
== 1;
2708 static bool IsAtLeastWin2kSP4()
2713 static int s_isAtLeastWin2kSP4
= -1;
2715 if ( s_isAtLeastWin2kSP4
== -1 )
2717 OSVERSIONINFOEX ver
;
2719 memset(&ver
, 0, sizeof(ver
));
2720 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2721 GetVersionEx((OSVERSIONINFO
*)&ver
);
2723 s_isAtLeastWin2kSP4
=
2724 ((ver
.dwMajorVersion
> 5) || // Vista+
2725 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2726 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2727 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2731 return s_isAtLeastWin2kSP4
== 1;
2736 // the code page we're working with
2739 // cached result of GetMBNulLen(), set to 0 initially meaning
2741 size_t m_minMBCharWidth
;
2744 #endif // wxHAVE_WIN32_MB2WC
2747 // ============================================================================
2748 // wxEncodingConverter based conversion classes
2749 // ============================================================================
2753 class wxMBConv_wxwin
: public wxMBConv
2758 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2759 // The wxMBConv_cf class does a better job.
2760 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2761 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2762 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2766 // temporarily just use wxEncodingConverter stuff,
2767 // so that it works while a better implementation is built
2768 wxMBConv_wxwin(const char* name
)
2771 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2773 m_enc
= wxFONTENCODING_SYSTEM
;
2778 wxMBConv_wxwin(wxFontEncoding enc
)
2785 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2787 size_t inbuf
= strlen(psz
);
2790 if (!m2w
.Convert(psz
, buf
))
2791 return wxCONV_FAILED
;
2796 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2798 const size_t inbuf
= wxWcslen(psz
);
2801 if (!w2m
.Convert(psz
, buf
))
2802 return wxCONV_FAILED
;
2808 virtual size_t GetMBNulLen() const
2812 case wxFONTENCODING_UTF16BE
:
2813 case wxFONTENCODING_UTF16LE
:
2816 case wxFONTENCODING_UTF32BE
:
2817 case wxFONTENCODING_UTF32LE
:
2825 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2827 bool IsOk() const { return m_ok
; }
2830 wxFontEncoding m_enc
;
2831 wxEncodingConverter m2w
, w2m
;
2834 // were we initialized successfully?
2837 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2840 // make the constructors available for unit testing
2841 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2843 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2844 if ( !result
->IsOk() )
2853 #endif // wxUSE_FONTMAP
2855 // ============================================================================
2856 // wxCSConv implementation
2857 // ============================================================================
2859 void wxCSConv::Init()
2866 wxCSConv::wxCSConv(const wxString
& charset
)
2870 if ( !charset
.empty() )
2872 SetName(charset
.ToAscii());
2876 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2878 m_encoding
= wxFONTENCODING_SYSTEM
;
2882 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2884 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2886 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2888 encoding
= wxFONTENCODING_SYSTEM
;
2893 m_encoding
= encoding
;
2896 wxCSConv::~wxCSConv()
2901 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2906 SetName(conv
.m_name
);
2907 m_encoding
= conv
.m_encoding
;
2910 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2914 SetName(conv
.m_name
);
2915 m_encoding
= conv
.m_encoding
;
2920 void wxCSConv::Clear()
2929 void wxCSConv::SetName(const char *charset
)
2933 m_name
= wxStrdup(charset
);
2940 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2941 wxEncodingNameCache
);
2943 static wxEncodingNameCache gs_nameCache
;
2946 wxMBConv
*wxCSConv::DoCreate() const
2949 wxLogTrace(TRACE_STRCONV
,
2950 wxT("creating conversion for %s"),
2952 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2953 #endif // wxUSE_FONTMAP
2955 // check for the special case of ASCII or ISO8859-1 charset: as we have
2956 // special knowledge of it anyhow, we don't need to create a special
2957 // conversion object
2958 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2959 m_encoding
== wxFONTENCODING_DEFAULT
)
2961 // don't convert at all
2965 // we trust OS to do conversion better than we can so try external
2966 // conversion methods first
2968 // the full order is:
2969 // 1. OS conversion (iconv() under Unix or Win32 API)
2970 // 2. hard coded conversions for UTF
2971 // 3. wxEncodingConverter as fall back
2977 #endif // !wxUSE_FONTMAP
2980 wxFontEncoding
encoding(m_encoding
);
2985 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2993 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2994 #endif // wxUSE_FONTMAP
2998 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2999 if ( it
!= gs_nameCache
.end() )
3001 if ( it
->second
.empty() )
3004 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
3011 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3012 // CS : in case this does not return valid names (eg for MacRoman)
3013 // encoding got a 'failure' entry in the cache all the same,
3014 // although it just has to be created using a different method, so
3015 // only store failed iconv creation attempts (or perhaps we
3016 // shouldn't do this at all ?)
3017 if ( names
[0] != NULL
)
3019 for ( ; *names
; ++names
)
3021 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3022 // will need changes that will obsolete this
3023 wxString
name(*names
);
3024 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
3027 gs_nameCache
[encoding
] = *names
;
3034 gs_nameCache
[encoding
] = _T(""); // cache the failure
3037 #endif // wxUSE_FONTMAP
3039 #endif // HAVE_ICONV
3041 #ifdef wxHAVE_WIN32_MB2WC
3044 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3045 : new wxMBConv_win32(m_encoding
);
3054 #endif // wxHAVE_WIN32_MB2WC
3058 // leave UTF16 and UTF32 to the built-ins of wx
3059 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3060 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3063 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3064 : new wxMBConv_cf(m_encoding
);
3066 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3075 #endif // __DARWIN__
3078 wxFontEncoding enc
= m_encoding
;
3080 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3082 // use "false" to suppress interactive dialogs -- we can be called from
3083 // anywhere and popping up a dialog from here is the last thing we want to
3085 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3087 #endif // wxUSE_FONTMAP
3091 case wxFONTENCODING_UTF7
:
3092 return new wxMBConvUTF7
;
3094 case wxFONTENCODING_UTF8
:
3095 return new wxMBConvUTF8
;
3097 case wxFONTENCODING_UTF16BE
:
3098 return new wxMBConvUTF16BE
;
3100 case wxFONTENCODING_UTF16LE
:
3101 return new wxMBConvUTF16LE
;
3103 case wxFONTENCODING_UTF32BE
:
3104 return new wxMBConvUTF32BE
;
3106 case wxFONTENCODING_UTF32LE
:
3107 return new wxMBConvUTF32LE
;
3110 // nothing to do but put here to suppress gcc warnings
3117 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3118 : new wxMBConv_wxwin(m_encoding
);
3124 #endif // wxUSE_FONTMAP
3126 // NB: This is a hack to prevent deadlock. What could otherwise happen
3127 // in Unicode build: wxConvLocal creation ends up being here
3128 // because of some failure and logs the error. But wxLog will try to
3129 // attach a timestamp, for which it will need wxConvLocal (to convert
3130 // time to char* and then wchar_t*), but that fails, tries to log the
3131 // error, but wxLog has an (already locked) critical section that
3132 // guards the static buffer.
3133 static bool alreadyLoggingError
= false;
3134 if (!alreadyLoggingError
)
3136 alreadyLoggingError
= true;
3137 wxLogError(_("Cannot convert from the charset '%s'!"),
3141 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
3142 #else // !wxUSE_FONTMAP
3143 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
3144 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3147 alreadyLoggingError
= false;
// Create the real converter object for this wxCSConv.
//
// The method is const, so the members are updated through a non-const alias
// of this. It chooses a concrete encoding when neither a charset name nor an
// encoding was given, stores the result of DoCreate() in m_convReal and
// resets m_deferred.
3153 void wxCSConv::CreateConvIfNeeded() const
3157 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3159 // if we have neither the name nor the encoding, use the default
3160 // encoding for this system
3161 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
// ask wxLocale for the system encoding (this appears to be the wxUSE_INTL
// branch, the ISO-8859-1 assignment below being the alternative)
3164 self
->m_encoding
= wxLocale::GetSystemEncoding();
3166 // fallback to some reasonable default:
3167 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3168 #endif // wxUSE_INTL
// build the converter and clear the deferred flag (presumably so that the
// creation is not attempted again -- confirm against the callers)
3171 self
->m_convReal
= DoCreate();
3172 self
->m_deferred
= false;
// Tell whether this converter can be used.
//
// Returns true for wxFONTENCODING_ISO8859_1, which needs no real converter
// object, and otherwise true only if m_convReal is non-NULL.
3176 bool wxCSConv::IsOk() const
// m_encoding and m_convReal are only meaningful once the converter has been
// set up, so do that first
3178 CreateConvIfNeeded();
3180 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3181 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3182 return true; // always ok as we do it ourselves
3184 // m_convReal->IsOk() is called at its own creation, so we know it must
3185 // be ok if m_convReal is non-NULL
3186 return m_convReal
!= NULL
;
// Convert a multibyte string to wide characters.
//
// dst/dstLen describe the output buffer and src/srcLen the input; srcLen may
// be wxNO_LEN, in which case the input length is taken from strlen(src) plus
// the terminating NUL.
//
// The call is forwarded to m_convReal->ToWChar() with the same arguments;
// the code after that return performs the conversion directly by copying
// each input byte to one wide character. wxCONV_FAILED is returned when
// dstLen is smaller than srcLen.
3189 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3190 const char *src
, size_t srcLen
) const
3192 CreateConvIfNeeded();
3195 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
// direct conversion: first determine how many bytes have to be converted
3198 if ( srcLen
== wxNO_LEN
)
3199 srcLen
= strlen(src
) + 1; // take trailing NUL too
// one output character is produced per input byte, so the output buffer
// must hold at least srcLen elements
3203 if ( dstLen
< srcLen
)
3204 return wxCONV_FAILED
;
// go through unsigned char so that bytes above 0x7F are not sign-extended
// where plain char is signed
3206 for ( size_t n
= 0; n
< srcLen
; n
++ )
3207 dst
[n
] = (unsigned char)(src
[n
]);
// Convert a wide character string to its multibyte form.
//
// dst/dstLen describe the output buffer and src/srcLen the input; srcLen may
// be wxNO_LEN, in which case the input length is taken from wxWcslen(src)
// plus the terminating NUL.
//
// The call is forwarded to m_convReal->FromWChar() with the same arguments;
// the code after that return performs the conversion directly, storing each
// wide character as a single char. wxCONV_FAILED is returned when dstLen is
// smaller than srcLen or when an input character is greater than 0xFF.
3213 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3214 const wchar_t *src
, size_t srcLen
) const
3216 CreateConvIfNeeded();
3219 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
// direct conversion: first determine how many characters have to be converted
3222 if ( srcLen
== wxNO_LEN
)
3223 srcLen
= wxWcslen(src
) + 1;
// one output byte is produced per input character, so the output buffer
// must hold at least srcLen bytes
3227 if ( dstLen
< srcLen
)
3228 return wxCONV_FAILED
;
3230 for ( size_t n
= 0; n
< srcLen
; n
++ )
// a character above 0xFF does not fit into one byte: give up
3232 if ( src
[n
] > 0xFF )
3233 return wxCONV_FAILED
;
3235 dst
[n
] = (char)src
[n
];
3239 else // still need to check the input validity
// same range check as above, but nothing is written in this branch
3241 for ( size_t n
= 0; n
< srcLen
; n
++ )
3243 if ( src
[n
] > 0xFF )
3244 return wxCONV_FAILED
;
// Report the value of GetMBNulLen() for this converter.
//
// The query is forwarded to m_convReal->GetMBNulLen(); the ISO-8859-1 case
// is handled after that return.
3251 size_t wxCSConv::GetMBNulLen() const
// m_convReal is only set once the converter has been set up
3253 CreateConvIfNeeded();
3257 return m_convReal
->GetMBNulLen();
3260 // otherwise, we are ISO-8859-1
3264 #if wxUSE_UNICODE_UTF8
// Report whether this converter is an UTF-8 one.
//
// The query is forwarded to m_convReal->IsUTF8(); the ISO-8859-1 case is
// handled after that return.
3265 bool wxCSConv::IsUTF8() const
// m_convReal is only set once the converter has been set up
3267 CreateConvIfNeeded();
3271 return m_convReal
->IsUTF8();
3274 // otherwise, we are ISO-8859-1
// Convert the narrow string s to wide characters, trying several converters
// in turn: wxConvLibc first, then a default-constructed wxMBConvUTF8 and
// finally wxConvISO8859_1.
//
// NOTE(review): each later attempt overwrites wbuf, presumably only when the
// previous attempt produced no result -- confirm.
3282 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
// early exit with an empty buffer (presumably for a NULL s -- confirm)
3285 return wxWCharBuffer();
// first choice: the libc converter
3287 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
// second choice: interpret the input as UTF-8
3289 wbuf
= wxMBConvUTF8().cMB2WX(s
);
// last choice: interpret the input as ISO-8859-1
3291 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Convert the wide string ws to a narrow string, trying wxConvLibc first and
// then an UTF-8 converter created with MAP_INVALID_UTF8_TO_OCTAL.
//
// NOTE(review): the second attempt overwrites buf, presumably only when the
// first attempt produced no result -- confirm.
3296 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
// early exit with an empty buffer (presumably for a NULL ws -- confirm)
3299 return wxCharBuffer();
// first choice: the libc converter
3301 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
// second choice: UTF-8 in its MAP_INVALID_UTF8_TO_OCTAL mode
3303 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3308 #endif // wxUSE_UNICODE
3310 // ----------------------------------------------------------------------------
3312 // ----------------------------------------------------------------------------
3314 // NB: The reason why we create converter objects in this convoluted way,
3315 // using a factory function instead of global variable, is that they
3316 // may be used at static initialization time (some of them are used by
3317 // wxString ctors and there may be a global wxString object). In other
3318 // words, possibly _before_ the converter global object would be
3325 #undef wxConvISO8859_1
// Define a global converter called name.
//
// klass is the type of the pointers handed out, impl_klass the type of the
// object actually created and ctor_args the text placed after the object
// name in its definition. The macro emits:
//  - the pointer variable name##Ptr, initially NULL
//  - the accessor wxGet_##name##Ptr(), which returns the address of a
//    function-local static impl_klass object
//  - a file-static gs_##name##instance initialized by calling the accessor
// The expansion has no trailing semicolon, the user of the macro supplies it.
3327 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3328 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3329 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3331 static impl_klass name##Obj ctor_args; \
3332 return &name##Obj; \
3334 /* this ensures that all global converter objects are created */ \
3335 /* by the time static initialization is done, i.e. before any */ \
3336 /* thread is launched: */ \
3337 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for the common case in which the object created has the same
// type as the pointers handed out.
3339 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3340 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// wxConvLibc is handed out as a wxMBConv pointer and is implemented either
// by wxMBConv_win32 or by wxMBConvLibc. Both definitions use the same name,
// so only one of them can be compiled in.
// NOTE(review): the selection presumably depends on wxHAVE_WIN32_MB2WC --
// confirm.
3343 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3345 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3348 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3349 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3350 // provokes an error message about "not enough macro parameters"; and we
3351 // can't use "()" here as the name##Obj declaration would be parsed as a
3352 // function declaration then, so use a semicolon and live with an extra
3353 // empty statement (and hope that no compiler warns about this)
// UTF-8 and UTF-7 converters; the ';' stands in for an empty list of
// constructor arguments
3354 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3355 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
// wxCSConv-based converters constructed from a font encoding
3357 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3358 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// wxConvCurrent initially points to the libc converter and wxConvUI to the
// wxConvLocal one
3360 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3361 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3364 // The xnu kernel always communicates file paths in decomposed UTF-8.
3365 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3366 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
// wxConvFileName is the converter pointer defined here with a
// platform-dependent initial value: when not building for Darwin it is the
// libc converter
3369 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3372 #else // !__DARWIN__
3373 wxGet_wxConvLibcPtr();
3374 #endif // __DARWIN__/!__DARWIN__
3376 #else // !wxUSE_WCHAR_T
3378 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3379 // stand-ins in absence of wchar_t
3380 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3385 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T