1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV _T("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
74 static bool NotAllNULs(const char *p
, size_t n
)
76 while ( n
&& *p
++ == '\0' )
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(reinterpret_cast<const wxUint16
*>(*pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten
= 0;
176 // the number of NULs terminating this string
177 size_t nulLen
= 0; // not really needed, but just to avoid warnings
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
185 if ( srcLen
!= wxNO_LEN
)
187 // we need to know how to find the end of this string
188 nulLen
= GetMBNulLen();
189 if ( nulLen
== wxCONV_FAILED
)
190 return wxCONV_FAILED
;
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
197 char * const p
= bufTmp
.data();
198 memcpy(p
, src
, srcLen
);
199 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
205 srcEnd
= src
+ srcLen
;
207 else // quit after the first loop iteration
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
216 // all the complications come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd == NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
229 // try to convert the current chunk
230 size_t lenChunk
= MB2WC(NULL
, src
, 0);
231 if ( lenChunk
== wxCONV_FAILED
)
232 return wxCONV_FAILED
;
234 dstWritten
+= lenChunk
;
240 // nothing left in the input string, conversion succeeded
246 if ( dstWritten
> dstLen
)
247 return wxCONV_FAILED
;
249 // +1 is for trailing NUL
250 if ( MB2WC(dst
, src
, lenChunk
+ 1) == wxCONV_FAILED
)
251 return wxCONV_FAILED
;
260 // we convert just one chunk in this case as this is the entire
265 // advance the input pointer past the end of this chunk
266 while ( NotAllNULs(src
, nulLen
) )
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if we advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
275 src
+= nulLen
; // skipping over its terminator as well
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
279 // delimited by srcEnd
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
298 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
299 const wchar_t *src
, size_t srcLen
) const
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten
= 0;
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
309 // make a copy of the input string unless it is already properly
311 wxWCharBuffer bufTmp
;
312 if ( isNulTerminated
)
314 srcLen
= wxWcslen(src
) + 1;
316 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
318 // make a copy in order to properly NUL-terminate the string
319 bufTmp
= wxWCharBuffer(srcLen
);
320 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
324 const size_t lenNul
= GetMBNulLen();
325 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
327 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
329 // try to convert the current chunk
330 size_t lenChunk
= WC2MB(NULL
, src
, 0);
332 if ( lenChunk
== wxCONV_FAILED
)
333 return wxCONV_FAILED
;
335 dstWritten
+= lenChunk
;
336 if ( isNulTerminated
)
337 dstWritten
+= lenNul
;
341 if ( dstWritten
> dstLen
)
342 return wxCONV_FAILED
;
344 if ( WC2MB(dst
, src
, lenChunk
+ lenNul
) == wxCONV_FAILED
)
345 return wxCONV_FAILED
;
348 if ( isNulTerminated
)
356 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
358 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
359 if ( rc
!= wxCONV_FAILED
)
361 // ToWChar() returns the buffer length, i.e. including the trailing
362 // NUL, while this method doesn't take it into account
369 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
371 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
372 if ( rc
!= wxCONV_FAILED
)
380 wxMBConv::~wxMBConv()
382 // nothing to do here (necessary for Darwin linking probably)
385 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
389 // calculate the length of the buffer needed first
390 const size_t nLen
= ToWChar(NULL
, 0, psz
);
391 if ( nLen
!= wxCONV_FAILED
)
393 // now do the actual conversion
394 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
396 // +1 for the trailing NUL
397 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
402 return wxWCharBuffer();
405 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
409 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
410 if ( nLen
!= wxCONV_FAILED
)
412 wxCharBuffer
buf(nLen
- 1);
413 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
418 return wxCharBuffer();
422 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
424 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
425 if ( dstLen
!= wxCONV_FAILED
)
427 // notice that we allocate space for dstLen+1 wide characters here
428 // because we want the buffer to always be NUL-terminated, even if the
429 // input isn't (as otherwise the caller has no way to know its length)
430 wxWCharBuffer
wbuf(dstLen
);
431 wbuf
.data()[dstLen
] = L
'\0';
432 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
438 // we also need to handle NUL-terminated input strings
439 // specially: for them the output is the length of the string
440 // excluding the trailing NUL, however if we're asked to
441 // convert a specific number of characters we return the length
442 // of the resulting output even if it's NUL-terminated
443 if ( inLen
== wxNO_LEN
)
454 return wxWCharBuffer();
458 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
460 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
461 if ( dstLen
!= wxCONV_FAILED
)
463 const size_t nulLen
= GetMBNulLen();
465 // as above, ensure that the buffer is always NUL-terminated, even if
467 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
468 memset(buf
.data() + dstLen
, 0, nulLen
);
469 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
475 if ( inLen
== wxNO_LEN
)
477 // in this case both input and output are NUL-terminated
478 // and we're not supposed to count NUL
490 return wxCharBuffer();
493 // ----------------------------------------------------------------------------
495 // ----------------------------------------------------------------------------
497 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
499 return wxMB2WC(buf
, psz
, n
);
502 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
504 return wxWC2MB(buf
, psz
, n
);
507 // ----------------------------------------------------------------------------
508 // wxConvBrokenFileNames
509 // ----------------------------------------------------------------------------
513 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
515 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
516 wxStricmp(charset
, _T("UTF8")) == 0 )
517 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
519 m_conv
= new wxCSConv(charset
);
524 // ----------------------------------------------------------------------------
526 // ----------------------------------------------------------------------------
528 // Implementation (C) 2004 Fredrik Roubert
530 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
533 // BASE64 decoding table
535 static const unsigned char utf7unb64
[] =
537 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
538 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
539 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
540 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
541 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
542 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
543 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
544 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
545 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
546 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
547 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
548 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
549 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
550 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
551 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
552 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
553 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
554 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
555 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
556 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
557 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
558 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
559 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
560 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
561 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
562 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
563 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
564 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
565 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
566 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
567 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
568 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
571 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
572 const char *src
, size_t srcLen
) const
574 DecoderState stateOrig
,
576 if ( srcLen
== wxNO_LEN
)
578 // convert the entire string, up to and including the trailing NUL
579 srcLen
= strlen(src
) + 1;
581 // when working on the entire strings we don't update nor use the shift
582 // state from the previous call
583 statePtr
= &stateOrig
;
585 else // when working with partial strings we do use the shift state
587 statePtr
= const_cast<DecoderState
*>(&m_stateDecoder
);
589 // also save the old state to be able to rollback to it on error
590 stateOrig
= m_stateDecoder
;
593 // but to simplify the code below we use this variable in both cases
594 DecoderState
& state
= *statePtr
;
597 // number of characters [which would have been] written to dst [if it were
601 const char * const srcEnd
= src
+ srcLen
;
603 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
605 const unsigned char cc
= *src
++;
607 if ( state
.IsShifted() )
609 const unsigned char dc
= utf7unb64
[cc
];
612 // end of encoded part, check that nothing was left: there can
613 // be up to 4 bits of 0 padding but nothing else (we also need
614 // to check isLSB as we count bits modulo 8 while a valid UTF-7
615 // encoded sequence must contain an integral number of UTF-16
617 if ( state
.isLSB
|| state
.bit
> 4 ||
618 (state
.accum
& ((1 << state
.bit
) - 1)) )
623 return wxCONV_FAILED
;
628 // re-parse this character normally below unless it's '-' which
629 // is consumed by the decoder
633 else // valid encoded character
635 // mini base64 decoder: each character is 6 bits
640 if ( state
.bit
>= 8 )
642 // got the full byte, consume it
644 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
648 // we've got the full word, output it
650 *dst
++ = (state
.msb
<< 8) | b
;
656 // just store it while we wait for LSB
664 if ( state
.IsDirect() )
666 // start of an encoded segment?
671 // just the encoded plus sign, don't switch to shifted mode
677 else if ( utf7unb64
[(unsigned)*src
] == 0xff )
679 // empty encoded chunks are not allowed
683 return wxCONV_FAILED
;
685 else // base-64 encoded chunk follows
692 // only printable 7 bit ASCII characters (with the exception of
693 // NUL, TAB, CR and LF) can be used directly
694 if ( cc
>= 0x7f || (cc
< ' ' &&
695 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
696 return wxCONV_FAILED
;
707 // as we didn't read any characters we should be called with the same
708 // data (followed by some more new data) again later so don't save our
712 return wxCONV_FAILED
;
719 // BASE64 encoding table
721 static const unsigned char utf7enb64
[] =
723 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
724 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
725 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
726 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
727 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
728 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
729 'w', 'x', 'y', 'z', '0', '1', '2', '3',
730 '4', '5', '6', '7', '8', '9', '+', '/'
734 // UTF-7 encoding table
736 // 0 - Set D (directly encoded characters)
737 // 1 - Set O (optional direct characters)
738 // 2 - whitespace characters (optional)
739 // 3 - special characters
741 static const unsigned char utf7encode
[128] =
743 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
744 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
745 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
746 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
747 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
748 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
749 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
750 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
753 static inline bool wxIsUTF7Direct(wchar_t wc
)
755 return wc
< 0x80 && utf7encode
[wc
] < 1;
758 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
759 const wchar_t *src
, size_t srcLen
) const
761 EncoderState stateOrig
,
763 if ( srcLen
== wxNO_LEN
)
765 // we don't apply the stored state when operating on entire strings at
767 statePtr
= &stateOrig
;
769 srcLen
= wxWcslen(src
) + 1;
771 else // do use the mode we left the output in previously
773 stateOrig
= m_stateEncoder
;
774 statePtr
= const_cast<EncoderState
*>(&m_stateEncoder
);
777 EncoderState
& state
= *statePtr
;
782 const wchar_t * const srcEnd
= src
+ srcLen
;
783 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
786 if ( wxIsUTF7Direct(cc
) )
788 if ( state
.IsShifted() )
790 // pad with zeros the last encoded block if necessary
794 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
809 else if ( cc
== '+' && state
.IsDirect() )
820 else if (((wxUint32
)cc
) > 0xffff)
822 // no surrogate pair generation (yet?)
823 return wxCONV_FAILED
;
828 if ( state
.IsDirect() )
837 // BASE64 encode string
840 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
843 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
845 for (state
.bit
+= 8; state
.bit
>= 6; )
849 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
854 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
862 // we need to restore the original encoder state if we were called just to
863 // calculate the amount of space needed as we will presumably be called
864 // again to really convert the data now
871 // ----------------------------------------------------------------------------
873 // ----------------------------------------------------------------------------
875 static const wxUint32 utf8_max
[]=
876 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
878 // boundaries of the private use area we use to (temporarily) remap
879 // characters that are invalid in a UTF-8 encoded string
880 const wxUint32 wxUnicodePUA
= 0x100000;
881 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
883 // this table gives the length of the UTF-8 encoding from its first character:
884 const unsigned char tableUtf8Lengths
[256] = {
885 // single-byte sequences (ASCII):
886 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
887 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
888 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
889 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
890 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
891 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
892 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
893 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
895 // these are invalid:
896 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
897 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
898 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
899 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
902 // two-byte sequences:
903 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
904 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
906 // three-byte sequences:
907 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
909 // four-byte sequences:
910 4, 4, 4, 4, 4, // F0..F4
912 // these are invalid again (5- or 6-byte
913 // sequences and sequences for code points
914 // above U+10FFFF, as restricted by RFC 3629):
915 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
919 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
920 const char *src
, size_t srcLen
) const
922 wchar_t *out
= dstLen
? dst
: NULL
;
925 if ( srcLen
== wxNO_LEN
)
926 srcLen
= strlen(src
) + 1;
928 for ( const char *p
= src
; ; p
++ )
930 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
932 // all done successfully, just add the trailing NUL if we are not
933 // using explicit length
934 if ( srcLen
== wxNO_LEN
)
950 if ( out
&& !dstLen
-- )
954 unsigned char c
= *p
;
958 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
961 if ( srcLen
!= wxNO_LEN
)
968 unsigned len
= tableUtf8Lengths
[c
];
972 if ( srcLen
< len
) // the test works for wxNO_LEN too
975 if ( srcLen
!= wxNO_LEN
)
978 // Char. number range | UTF-8 octet sequence
979 // (hexadecimal) | (binary)
980 // ----------------------+----------------------------------------
981 // 0000 0000 - 0000 007F | 0xxxxxxx
982 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
983 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
984 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
986 // Code point value is stored in bits marked with 'x',
987 // lowest-order bit of the value on the right side in the diagram
988 // above. (from RFC 3629)
990 // mask to extract lead byte's value ('x' bits above), by sequence
992 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
994 // mask and value of lead byte's most significant bits, by length:
995 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
996 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
998 len
--; // it's more convenient to work with 0-based length here
1000 // extract the lead byte's value bits:
1001 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
1004 code
= c
& leadValueMask
[len
];
1006 // all remaining bytes, if any, are handled in the same way
1007 // regardless of sequence's length:
1008 for ( ; len
; --len
)
1011 if ( (c
& 0xC0) != 0x80 )
1012 return wxCONV_FAILED
;
1020 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1021 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
1030 #endif // WC_UTF16/!WC_UTF16
1038 return wxCONV_FAILED
;
1042 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
1043 const wchar_t *src
, size_t srcLen
) const
1045 char *out
= dstLen
? dst
: NULL
;
1048 for ( const wchar_t *wp
= src
; ; wp
++ )
1050 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
) )
1052 // all done successfully, just add the trailing NUL if we are not
1053 // using explicit length
1054 if ( srcLen
== wxNO_LEN
)
1070 if ( srcLen
!= wxNO_LEN
)
1075 // cast is ok for WC_UTF16
1076 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1078 // skip the next char too as we decoded a surrogate
1081 #else // wchar_t is UTF-32
1082 code
= *wp
& 0x7fffffff;
1094 out
[0] = (char)code
;
1097 else if ( code
<= 0x07FF )
1105 // NB: this line takes 6 least significant bits, encodes them as
1106 // 10xxxxxx and discards them so that the next byte can be encoded:
1107 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1108 out
[0] = 0xC0 | code
;
1111 else if ( code
< 0xFFFF )
1119 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1120 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1121 out
[0] = 0xE0 | code
;
1124 else if ( code
<= 0x10FFFF )
1132 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1133 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1134 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1135 out
[0] = 0xF0 | code
;
1140 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1153 // we only get here if an error occurs during decoding
1154 return wxCONV_FAILED
;
1157 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1158 const char *psz
, size_t srcLen
) const
1160 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1161 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1165 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1167 const char *opsz
= psz
;
1168 bool invalid
= false;
1169 unsigned char cc
= *psz
++, fc
= cc
;
1171 for (cnt
= 0; fc
& 0x80; cnt
++)
1181 // escape the escape character for octal escapes
1182 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1183 && cc
== '\\' && (!buf
|| len
< n
))
1195 // invalid UTF-8 sequence
1200 unsigned ocnt
= cnt
- 1;
1201 wxUint32 res
= cc
& (0x3f >> cnt
);
1205 if ((cc
& 0xC0) != 0x80)
1207 // invalid UTF-8 sequence
1213 res
= (res
<< 6) | (cc
& 0x3f);
1216 if (invalid
|| res
<= utf8_max
[ocnt
])
1218 // illegal UTF-8 encoding
1221 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1222 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1224 // if one of our PUA characters turns up externally
1225 // it must also be treated as an illegal sequence
1226 // (a bit like you have to escape an escape character)
1232 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1233 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1234 if (pa
== wxCONV_FAILED
)
1246 *buf
++ = (wchar_t)res
;
1248 #endif // WC_UTF16/!WC_UTF16
1254 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1256 while (opsz
< psz
&& (!buf
|| len
< n
))
1259 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1260 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1261 wxASSERT(pa
!= wxCONV_FAILED
);
1268 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1274 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1276 while (opsz
< psz
&& (!buf
|| len
< n
))
1278 if ( buf
&& len
+ 3 < n
)
1280 unsigned char on
= *opsz
;
1282 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1283 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1284 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1291 else // MAP_INVALID_UTF8_NOT
1293 return wxCONV_FAILED
;
1299 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1305 static inline bool isoctal(wchar_t wch
)
1307 return L
'0' <= wch
&& wch
<= L
'7';
1310 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1311 const wchar_t *psz
, size_t srcLen
) const
1313 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1314 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1318 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1323 // cast is ok for WC_UTF16
1324 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1325 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1327 cc
= (*psz
++) & 0x7fffffff;
1330 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1331 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1334 *buf
++ = (char)(cc
- wxUnicodePUA
);
1337 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1338 && cc
== L
'\\' && psz
[0] == L
'\\' )
1345 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1347 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1351 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1352 (psz
[1] - L
'0') * 010 +
1362 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1378 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1380 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1386 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1392 // ============================================================================
1394 // ============================================================================
1396 #ifdef WORDS_BIGENDIAN
1397 #define wxMBConvUTF16straight wxMBConvUTF16BE
1398 #define wxMBConvUTF16swap wxMBConvUTF16LE
1400 #define wxMBConvUTF16swap wxMBConvUTF16BE
1401 #define wxMBConvUTF16straight wxMBConvUTF16LE
1405 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1407 if ( srcLen
== wxNO_LEN
)
1409 // count the number of bytes in input, including the trailing NULs
1410 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1411 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1414 srcLen
*= BYTES_PER_CHAR
;
1416 else // we already have the length
1418 // we can only convert an entire number of UTF-16 characters
1419 if ( srcLen
% BYTES_PER_CHAR
)
1420 return wxCONV_FAILED
;
1426 // case when in-memory representation is UTF-16 too
1429 // ----------------------------------------------------------------------------
1430 // conversions without endianness change
1431 // ----------------------------------------------------------------------------
1434 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1435 const char *src
, size_t srcLen
) const
1437 // set up the scene for using memcpy() (which is presumably more efficient
1438 // than copying the bytes one by one)
1439 srcLen
= GetLength(src
, srcLen
);
1440 if ( srcLen
== wxNO_LEN
)
1441 return wxCONV_FAILED
;
1443 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1446 if ( dstLen
< inLen
)
1447 return wxCONV_FAILED
;
1449 memcpy(dst
, src
, srcLen
);
1456 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1457 const wchar_t *src
, size_t srcLen
) const
1459 if ( srcLen
== wxNO_LEN
)
1460 srcLen
= wxWcslen(src
) + 1;
1462 srcLen
*= BYTES_PER_CHAR
;
1466 if ( dstLen
< srcLen
)
1467 return wxCONV_FAILED
;
1469 memcpy(dst
, src
, srcLen
);
1475 // ----------------------------------------------------------------------------
1476 // endian-reversing conversions
1477 // ----------------------------------------------------------------------------
1480 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1481 const char *src
, size_t srcLen
) const
1483 srcLen
= GetLength(src
, srcLen
);
1484 if ( srcLen
== wxNO_LEN
)
1485 return wxCONV_FAILED
;
1487 srcLen
/= BYTES_PER_CHAR
;
1491 if ( dstLen
< srcLen
)
1492 return wxCONV_FAILED
;
1494 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1495 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1497 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1505 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1506 const wchar_t *src
, size_t srcLen
) const
1508 if ( srcLen
== wxNO_LEN
)
1509 srcLen
= wxWcslen(src
) + 1;
1511 srcLen
*= BYTES_PER_CHAR
;
1515 if ( dstLen
< srcLen
)
1516 return wxCONV_FAILED
;
1518 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1519 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1521 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1528 #else // !WC_UTF16: wchar_t is UTF-32
1530 // ----------------------------------------------------------------------------
1531 // conversions without endianness change
1532 // ----------------------------------------------------------------------------
// Native-endian UTF-16 -> UTF-32 wchar_t.
// The input is walked as 16-bit units; wxDecodeSurrogate() consumes one or two
// units per code point and advances inBuff itself (hence the empty loop
// increment). Fails with wxCONV_FAILED when GetLength() reports an error, when
// decoding fails, or when more than dstLen characters would be produced.
1535 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1536 const char *src
, size_t srcLen
) const
1538 srcLen
= GetLength(src
, srcLen
);
1539 if ( srcLen
== wxNO_LEN
)
1540 return wxCONV_FAILED
;
1542 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1545 // optimization: return maximal space which could be needed for this
1546 // string even if the real size could be smaller if the buffer contains
1552 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1553 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1555 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1557 return wxCONV_FAILED
;
1559 if ( ++outLen
> dstLen
)
1560 return wxCONV_FAILED
;
// UTF-32 wchar_t -> native-endian UTF-16.
// Each wide character is passed through encode_utf16(), which yields one unit
// or a surrogate pair (numChars == 2). outLen accumulates the output size in
// bytes; the call fails with wxCONV_FAILED if a character cannot be encoded
// or if outLen exceeds dstLen.
1570 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1571 const wchar_t *src
, size_t srcLen
) const
1573 if ( srcLen
== wxNO_LEN
)
1574 srcLen
= wxWcslen(src
) + 1;
1577 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1578 for ( size_t n
= 0; n
< srcLen
; n
++ )
1581 const size_t numChars
= encode_utf16(*src
++, cc
);
1582 if ( numChars
== wxCONV_FAILED
)
1583 return wxCONV_FAILED
;
1585 outLen
+= numChars
* BYTES_PER_CHAR
;
1588 if ( outLen
> dstLen
)
1589 return wxCONV_FAILED
;
1592 if ( numChars
== 2 )
1594 // second character of a surrogate
1603 // ----------------------------------------------------------------------------
1604 // endian-reversing conversions
1605 // ----------------------------------------------------------------------------
// Opposite-endian UTF-16 -> UTF-32 wchar_t.
// Input units are byte-swapped into a small temporary (tmp[0], tmp[1]) so that
// decode_utf16() can work on native-endian data; numChars == 2 marks a
// surrogate pair. Fails with wxCONV_FAILED when GetLength() reports an error,
// when decoding fails, or when more than dstLen characters would be produced.
1608 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1609 const char *src
, size_t srcLen
) const
1611 srcLen
= GetLength(src
, srcLen
);
1612 if ( srcLen
== wxNO_LEN
)
1613 return wxCONV_FAILED
;
1615 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1618 // optimization: return maximal space which could be needed for this
1619 // string even if the real size could be smaller if the buffer contains
1625 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1626 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1631 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1633 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1635 const size_t numChars
= decode_utf16(tmp
, ch
);
1636 if ( numChars
== wxCONV_FAILED
)
1637 return wxCONV_FAILED
;
1639 if ( numChars
== 2 )
1642 if ( ++outLen
> dstLen
)
1643 return wxCONV_FAILED
;
// UTF-32 wchar_t -> opposite-endian UTF-16.
// Each wide character is encoded with encode_utf16() into cc[]; the resulting
// unit (or both units of a surrogate pair) is byte-swapped on output. outLen
// counts output bytes and the call fails with wxCONV_FAILED if a character
// cannot be encoded or outLen exceeds dstLen.
1653 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1654 const wchar_t *src
, size_t srcLen
) const
1656 if ( srcLen
== wxNO_LEN
)
1657 srcLen
= wxWcslen(src
) + 1;
1660 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1661 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1664 const size_t numChars
= encode_utf16(*src
, cc
);
1665 if ( numChars
== wxCONV_FAILED
)
1666 return wxCONV_FAILED
;
1668 outLen
+= numChars
* BYTES_PER_CHAR
;
1671 if ( outLen
> dstLen
)
1672 return wxCONV_FAILED
;
1674 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1675 if ( numChars
== 2 )
1677 // second character of a surrogate
1678 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1686 #endif // WC_UTF16/!WC_UTF16
1689 // ============================================================================
1691 // ============================================================================
// Map the "straight" (native byte order) and "swap" (reversed byte order)
// UTF-32 converter names onto the BE/LE classes: on big-endian hosts straight
// is BE and swap is LE, otherwise the other way round.
1693 #ifdef WORDS_BIGENDIAN
1694 #define wxMBConvUTF32straight wxMBConvUTF32BE
1695 #define wxMBConvUTF32swap wxMBConvUTF32LE
1697 #define wxMBConvUTF32swap wxMBConvUTF32BE
1698 #define wxMBConvUTF32straight wxMBConvUTF32LE
// global UTF-32 converter objects, one per byte order
1702 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1703 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
// Compute the length in bytes of a UTF-32 input buffer.
// With srcLen == wxNO_LEN the buffer is scanned as 32-bit units up to and
// including the terminating zero unit, and the unit count is multiplied by
// BYTES_PER_CHAR. With an explicit srcLen the value must be a multiple of
// BYTES_PER_CHAR, otherwise wxCONV_FAILED is returned.
1706 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1708 if ( srcLen
== wxNO_LEN
)
1710 // count the number of bytes in input, including the trailing NULs
1711 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1712 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1715 srcLen
*= BYTES_PER_CHAR
;
1717 else // we already have the length
1719 // we can only convert an entire number of UTF-32 characters
1720 if ( srcLen
% BYTES_PER_CHAR
)
1721 return wxCONV_FAILED
;
1727 // case when in-memory representation is UTF-16
1730 // ----------------------------------------------------------------------------
1731 // conversions without endianness change
1732 // ----------------------------------------------------------------------------
// Native-endian UTF-32 -> UTF-16 wchar_t.
// Every 32-bit input unit is encoded with encode_utf16(), producing one wide
// character or a surrogate pair (numChars == 2). Fails with wxCONV_FAILED when
// GetLength() reports an error, when a code point cannot be encoded, or when
// outLen exceeds dstLen.
// NOTE(review): GetLength() returns wxCONV_FAILED on error but the result is
// compared with wxNO_LEN -- presumably the two constants are equal; confirm.
1735 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1736 const char *src
, size_t srcLen
) const
1738 srcLen
= GetLength(src
, srcLen
);
1739 if ( srcLen
== wxNO_LEN
)
1740 return wxCONV_FAILED
;
1742 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1743 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1745 for ( size_t n
= 0; n
< inLen
; n
++ )
1748 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1749 if ( numChars
== wxCONV_FAILED
)
1750 return wxCONV_FAILED
;
1755 if ( outLen
> dstLen
)
1756 return wxCONV_FAILED
;
1759 if ( numChars
== 2 )
1761 // second character of a surrogate
// UTF-16 wchar_t -> native-endian UTF-32.
// One code path returns srcLen * BYTES_PER_CHAR as an upper bound on the
// output size (presumably when no destination buffer is given -- the guarding
// condition is not visible here). Otherwise wxDecodeSurrogate() combines
// surrogate pairs into single code points, advancing src itself, and the call
// fails with wxCONV_FAILED on a decoding error or when outLen (in bytes)
// exceeds dstLen.
1771 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1772 const wchar_t *src
, size_t srcLen
) const
1774 if ( srcLen
== wxNO_LEN
)
1775 srcLen
= wxWcslen(src
) + 1;
1779 // optimization: return maximal space which could be needed for this
1780 // string instead of the exact amount which could be less if there are
1781 // any surrogates in the input
1783 // we consider that surrogates are rare enough to make it worthwhile to
1784 // avoid running the loop below at the cost of slightly extra memory
1786 return srcLen
* BYTES_PER_CHAR
;
1789 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
1791 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1793 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1795 return wxCONV_FAILED
;
1797 outLen
+= BYTES_PER_CHAR
;
1799 if ( outLen
> dstLen
)
1800 return wxCONV_FAILED
;
1808 // ----------------------------------------------------------------------------
1809 // endian-reversing conversions
1810 // ----------------------------------------------------------------------------
// Opposite-endian UTF-32 -> UTF-16 wchar_t.
// Same as the straight version except that every 32-bit input unit is
// byte-swapped before being handed to encode_utf16(). Fails with
// wxCONV_FAILED when GetLength() reports an error, when a code point cannot be
// encoded, or when outLen exceeds dstLen.
1813 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1814 const char *src
, size_t srcLen
) const
1816 srcLen
= GetLength(src
, srcLen
);
1817 if ( srcLen
== wxNO_LEN
)
1818 return wxCONV_FAILED
;
1820 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1821 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1823 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1826 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1827 if ( numChars
== wxCONV_FAILED
)
1828 return wxCONV_FAILED
;
1833 if ( outLen
> dstLen
)
1834 return wxCONV_FAILED
;
1837 if ( numChars
== 2 )
1839 // second character of a surrogate
// UTF-16 wchar_t -> opposite-endian UTF-32.
// One code path returns srcLen * BYTES_PER_CHAR as an upper bound on the
// output size (the guarding condition is not visible here). Otherwise
// surrogate pairs are combined by wxDecodeSurrogate() and each resulting code
// point is byte-swapped on output; the call fails with wxCONV_FAILED on a
// decoding error or when outLen (in bytes) exceeds dstLen.
1849 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1850 const wchar_t *src
, size_t srcLen
) const
1852 if ( srcLen
== wxNO_LEN
)
1853 srcLen
= wxWcslen(src
) + 1;
1857 // optimization: return maximal space which could be needed for this
1858 // string instead of the exact amount which could be less if there are
1859 // any surrogates in the input
1861 // we consider that surrogates are rare enough to make it worthwhile to
1862 // avoid running the loop below at the cost of slightly extra memory
1864 return srcLen
*BYTES_PER_CHAR
;
1867 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
1869 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1871 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1873 return wxCONV_FAILED
;
1875 outLen
+= BYTES_PER_CHAR
;
1877 if ( outLen
> dstLen
)
1878 return wxCONV_FAILED
;
1880 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1886 #else // !WC_UTF16: wchar_t is UTF-32
1888 // ----------------------------------------------------------------------------
1889 // conversions without endianness change
1890 // ----------------------------------------------------------------------------
// Native-endian UTF-32 -> UTF-32 wchar_t: input and output have the same
// representation, so the data is copied with a single memcpy() of srcLen
// bytes. Fails with wxCONV_FAILED when GetLength() reports an error or when
// dstLen is smaller than the number of characters (inLen).
1893 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1894 const char *src
, size_t srcLen
) const
1896 // use memcpy() as it should be much faster than hand-written loop
1897 srcLen
= GetLength(src
, srcLen
);
1898 if ( srcLen
== wxNO_LEN
)
1899 return wxCONV_FAILED
;
1901 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1904 if ( dstLen
< inLen
)
1905 return wxCONV_FAILED
;
1907 memcpy(dst
, src
, srcLen
);
// UTF-32 wchar_t -> native-endian UTF-32: a plain memcpy().
// srcLen defaults to wxWcslen(src) + 1 (terminating NUL included) and is
// converted to bytes before being compared with dstLen; the call fails with
// wxCONV_FAILED if the destination is too small.
1914 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1915 const wchar_t *src
, size_t srcLen
) const
1917 if ( srcLen
== wxNO_LEN
)
1918 srcLen
= wxWcslen(src
) + 1;
1920 srcLen
*= BYTES_PER_CHAR
;
1924 if ( dstLen
< srcLen
)
1925 return wxCONV_FAILED
;
1927 memcpy(dst
, src
, srcLen
);
1933 // ----------------------------------------------------------------------------
1934 // endian-reversing conversions
1935 // ----------------------------------------------------------------------------
// Opposite-endian UTF-32 -> UTF-32 wchar_t: every 32-bit unit is byte-swapped
// into dst. Fails with wxCONV_FAILED when GetLength() reports an error or when
// dstLen is smaller than the number of 32-bit units.
1938 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1939 const char *src
, size_t srcLen
) const
1941 srcLen
= GetLength(src
, srcLen
);
1942 if ( srcLen
== wxNO_LEN
)
1943 return wxCONV_FAILED
;
1945 srcLen
/= BYTES_PER_CHAR
;
1949 if ( dstLen
< srcLen
)
1950 return wxCONV_FAILED
;
1952 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1953 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1955 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
// UTF-32 wchar_t -> opposite-endian UTF-32: every wide character is
// byte-swapped into the output. srcLen defaults to wxWcslen(src) + 1 and is
// converted to bytes; the loop counter n therefore advances in steps of
// BYTES_PER_CHAR. Fails with wxCONV_FAILED if dstLen (bytes) is too small.
1963 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1964 const wchar_t *src
, size_t srcLen
) const
1966 if ( srcLen
== wxNO_LEN
)
1967 srcLen
= wxWcslen(src
) + 1;
1969 srcLen
*= BYTES_PER_CHAR
;
1973 if ( dstLen
< srcLen
)
1974 return wxCONV_FAILED
;
1976 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
1977 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1979 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1986 #endif // WC_UTF16/!WC_UTF16
1989 // ============================================================================
1990 // The classes doing conversion using the iconv_xxx() functions
1991 // ============================================================================
1995 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1996 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1997 // (unless there's yet another bug in glibc) the only case when iconv()
1998 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1999 // left in the input buffer -- when _real_ error occurs,
2000 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2002 // [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call really failed. For
// glibc 2.0/2.1 an E2BIG error with no input left is not treated as failure.
2003 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2004 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2005 (errno != E2BIG || bufLeft != 0))
2007 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
// cast for the input buffer argument of iconv(), whose constness is given by
// ICONV_CONST
2010 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
// value used in this file to mark an unusable iconv_t handle
2012 #define ICONV_T_INVALID ((iconv_t)-1)
// WC_BSWAP swaps the bytes of one wchar_t and WC_ENC is the wxFontEncoding
// matching wchar_t, both selected by SIZEOF_WCHAR_T
2014 #if SIZEOF_WCHAR_T == 4
2015 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2016 #define WC_ENC wxFONTENCODING_UTF32
2017 #elif SIZEOF_WCHAR_T == 2
2018 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2019 #define WC_ENC wxFONTENCODING_UTF16
2020 #else // sizeof(wchar_t) != 2 nor 4
2021 // does this ever happen?
2022 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2025 // ----------------------------------------------------------------------------
2026 // wxMBConv_iconv: encapsulates an iconv character set
2027 // ----------------------------------------------------------------------------
// wxMBConv implementation backed by a pair of iconv handles: m2w converts
// from the named multibyte charset to wide characters and w2m converts in the
// opposite direction. The object is usable only if both handles are valid.
2029 class wxMBConv_iconv
: public wxMBConv
2032 wxMBConv_iconv(const char *name
);
2033 virtual ~wxMBConv_iconv();
2035 // implement base class virtual methods
2036 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
2037 const char *src
, size_t srcLen
= wxNO_LEN
) const;
2038 virtual size_t FromWChar(char *dst
, size_t dstLen
,
2039 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
2040 virtual size_t GetMBNulLen() const;
2042 #if wxUSE_UNICODE_UTF8
2043 virtual bool IsUTF8() const;
// the clone is created from the same charset name and inherits the cached
// NUL length so that it doesn't have to be recomputed
2046 virtual wxMBConv
*Clone() const
2048 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
2049 p
->m_minMBCharWidth
= m_minMBCharWidth
;
// both conversion directions must have been opened successfully
2054 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
2057 // the iconv handlers used to translate from multibyte
2058 // to wide char and in the other direction
2063 // guards access to m2w and w2m objects
2064 wxMutex m_iconvMutex
;
2068 // the name (for iconv_open()) of a wide char charset -- if none is
2069 // available on this machine, it will remain NULL
2070 static wxString ms_wcCharsetName
;
2072 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2073 // different endian-ness than the native one
2074 static bool ms_wcNeedsSwap
;
2077 // name of the encoding handled by this conversion
2080 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2082 size_t m_minMBCharWidth
;
2085 // make the constructor available for unit testing: creates a
// wxMBConv_iconv for the given charset name and checks it with IsOk()
// (the handling of a failed check is not visible here)
2086 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2088 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2089 if ( !result
->IsOk() )
// definitions of the static members: the wide char charset name starts out
// empty (not yet determined) and no byte swapping is assumed
2098 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2099 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
// Open the iconv handles for converting between the charset "name" and
// wchar_t.
//
// On first use (ms_wcCharsetName still empty) the constructor searches for an
// iconv charset name representing wchar_t: for each candidate it first tries
// the name with an explicit byte order suffix and then the bare name, in
// which case a one character test conversion determines whether the result
// needs byte swapping (ms_wcNeedsSwap). The name found is cached in
// ms_wcCharsetName and reused by all subsequent objects.
//
// If no wide char charset was found, w2m is set to ICONV_T_INVALID; otherwise
// the reverse conversion is opened as well and a failure to do so is traced.
2101 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
2104 m_minMBCharWidth
= 0;
2106 // check for charset that represents wchar_t:
2107 if ( ms_wcCharsetName
.empty() )
2109 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
2112 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2113 #else // !wxUSE_FONTMAP
2114 static const wxChar
*names_static
[] =
2116 #if SIZEOF_WCHAR_T == 4
// NB: this must be a comparison ("=="): "=" is not a valid operator in a
// preprocessor expression and breaks builds with 2 byte wchar_t
2118 #elif SIZEOF_WCHAR_T == 2
2123 const wxChar
**names
= names_static
;
2124 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2126 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2128 const wxString
nameCS(*names
);
2130 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2131 wxString
nameXE(nameCS
);
2133 #ifdef WORDS_BIGENDIAN
2135 #else // little endian
2139 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2142 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2143 if ( m2w
== ICONV_T_INVALID
)
2145 // try charset w/o bytesex info (e.g. "UCS4")
2146 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2148 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2150 // and check for bytesex ourselves:
2151 if ( m2w
!= ICONV_T_INVALID
)
2153 char buf
[2], *bufPtr
;
2162 outsz
= SIZEOF_WCHAR_T
* 2;
2163 char* wbufPtr
= (char*)wbuf
;
2167 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2170 if (ICONV_FAILED(res
, insz
))
2172 wxLogLastError(wxT("iconv"));
2173 wxLogError(_("Conversion to charset '%s' doesn't work."),
2176 else // ok, can convert to this encoding, remember it
2178 ms_wcCharsetName
= nameCS
;
2179 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2183 else // use charset not requiring byte swapping
2185 ms_wcCharsetName
= nameXE
;
2189 wxLogTrace(TRACE_STRCONV
,
2190 wxT("iconv wchar_t charset is \"%s\"%s"),
2191 ms_wcCharsetName
.empty() ? wxString("<none>")
2193 ms_wcNeedsSwap
? _T(" (needs swap)")
2196 else // we already have ms_wcCharsetName
2198 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2201 if ( ms_wcCharsetName
.empty() )
2203 w2m
= ICONV_T_INVALID
;
2207 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2208 if ( w2m
== ICONV_T_INVALID
)
2210 wxLogTrace(TRACE_STRCONV
,
2211 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2212 ms_wcCharsetName
.c_str(), name
);
// Each handle is tested against ICONV_T_INVALID before being released
// (presumably with iconv_close() -- the calls themselves are not visible
// here).
2217 wxMBConv_iconv::~wxMBConv_iconv()
2219 if ( m2w
!= ICONV_T_INVALID
)
2221 if ( w2m
!= ICONV_T_INVALID
)
// Convert multibyte text to wide characters using the m2w iconv handle.
//
// If srcLen is wxNO_LEN the input length is determined here, using strlen()
// or a scan for nulLen consecutive NUL bytes on a character boundary depending
// on the value returned by GetMBNulLen(). dstLen is given in wide characters
// and converted to bytes for iconv. With a destination buffer the text is
// converted in place (and byte-swapped afterwards if needed); without one the
// required size is computed by repeatedly converting into a small temporary
// buffer while iconv reports E2BIG. Returns wxCONV_FAILED if iconv fails.
2226 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2227 const char *src
, size_t srcLen
) const
2229 if ( srcLen
== wxNO_LEN
)
2231 // find the string length: notice that this must be done differently for
2232 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2234 const size_t nulLen
= GetMBNulLen();
2238 return wxCONV_FAILED
;
2241 srcLen
= strlen(src
); // arguably more optimized than our version
2246 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2247 // but they also have to start at character boundary and not
2248 // span two adjacent characters
2250 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2256 // when we're determining the length of the string ourselves we count
2257 // the terminating NUL(s) as part of it and always NUL-terminate the
2262 // we express length in the number of (wide) characters but iconv always
2263 // counts buffer sizes in bytes
2264 dstLen
*= SIZEOF_WCHAR_T
;
2267 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2268 // Unfortunately there are a couple of global wxCSConv objects such as
2269 // wxConvLocal that are used all over wx code, so we have to make sure
2270 // the handle is used by at most one thread at the time. Otherwise
2271 // only a few wx classes would be safe to use from non-main threads
2272 // as MB<->WC conversion would fail "randomly".
2273 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2274 #endif // wxUSE_THREADS
2277 const char *pszPtr
= src
;
2281 char* bufPtr
= (char*)dst
;
2283 // have destination buffer, convert there
2284 size_t dstLenOrig
= dstLen
;
2286 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2289 // convert the number of bytes converted as returned by iconv to the
2290 // number of (wide) characters converted that we need
2291 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2295 // convert to native endianness
2296 for ( unsigned i
= 0; i
< res
; i
++ )
2297 dst
[i
] = WC_BSWAP(dst
[i
]);
2300 else // no destination buffer
2302 // convert using temp buffer to calculate the size of the buffer needed
2308 char* bufPtr
= (char*)tbuf
;
2309 dstLen
= 8 * SIZEOF_WCHAR_T
;
2312 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2315 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
2317 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2320 if (ICONV_FAILED(cres
, srcLen
))
2322 //VS: it is ok if iconv fails, hence trace only
2323 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2324 return wxCONV_FAILED
;
// Convert wide characters to multibyte text using the w2m iconv handle.
//
// srcLen defaults to wxWcslen(src) + 1. If the wide char charset needs byte
// swapping, the input is first copied into a malloc()ed temporary buffer and
// swapped there, leaving the caller's buffer untouched. With a destination
// buffer the text is converted directly into it; without one the required
// size is accumulated by converting into a temporary buffer while iconv
// reports E2BIG. Returns wxCONV_FAILED if iconv fails.
2330 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2331 const wchar_t *src
, size_t srcLen
) const
2334 // NB: explained in MB2WC
2335 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2338 if ( srcLen
== wxNO_LEN
)
2339 srcLen
= wxWcslen(src
) + 1;
2341 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2342 size_t outbuflen
= dstLen
;
2345 wchar_t *tmpbuf
= 0;
2349 // need to copy to temp buffer to switch endianness
2350 // (doing WC_BSWAP twice on the original buffer won't work, as it
2351 // could be in read-only memory, or be accessed in some other thread)
2352 tmpbuf
= (wchar_t *)malloc(inbuflen
);
2353 for ( size_t i
= 0; i
< srcLen
; i
++ )
2354 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2359 char* inbuf
= (char*)src
;
2362 // have destination buffer, convert there
2363 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2365 res
= dstLen
- outbuflen
;
2367 else // no destination buffer
2369 // convert using temp buffer to calculate the size of the buffer needed
2375 outbuflen
= WXSIZEOF(tbuf
);
2377 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2379 res
+= WXSIZEOF(tbuf
) - outbuflen
;
2381 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2389 if (ICONV_FAILED(cres
, inbuflen
))
2391 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2392 return wxCONV_FAILED
;
// Return the number of bytes used by the terminating NUL in this encoding.
//
// The value is computed on first call by converting a single wide NUL with
// the w2m handle and measuring how many bytes were written; it is then cached
// in m_minMBCharWidth (0 means "not computed yet"). If the conversion fails
// the cached value becomes (size_t)-1. The method is const, so the cache is
// updated through a const-cast "self" pointer.
2398 size_t wxMBConv_iconv::GetMBNulLen() const
2400 if ( m_minMBCharWidth
== 0 )
2402 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2405 // NB: explained in MB2WC
2406 wxMutexLocker
lock(self
->m_iconvMutex
);
2409 const wchar_t *wnul
= L
"";
2410 char buf
[8]; // should be enough for NUL in any encoding
2411 size_t inLen
= sizeof(wchar_t),
2412 outLen
= WXSIZEOF(buf
);
2413 char *inBuff
= (char *)wnul
;
2414 char *outBuff
= buf
;
2415 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2417 self
->m_minMBCharWidth
= (size_t)-1;
2421 self
->m_minMBCharWidth
= outBuff
- buf
;
2425 return m_minMBCharWidth
;
2428 #if wxUSE_UNICODE_UTF8
// Return true if this converter's charset name is "UTF-8" or "UTF8", compared
// case-insensitively with wxStricmp().
2429 bool wxMBConv_iconv::IsUTF8() const
2431 return wxStricmp(m_name
, "UTF-8") == 0 ||
2432 wxStricmp(m_name
, "UTF8") == 0;
2436 #endif // HAVE_ICONV
2439 // ============================================================================
2440 // Win32 conversion classes
2441 // ============================================================================
2443 #ifdef wxHAVE_WIN32_MB2WC
// helpers defined elsewhere which map a charset name or a wxFontEncoding to a
// Windows code page; wxMBConv_win32::IsOk() treats -1 as an invalid code page
2447 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2448 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
// wxMBConv implementation using the Win32 MultiByteToWideChar() and
// WideCharToMultiByte() functions for the code page m_CodePage.
//
// Because these functions may silently produce lossy results, MB2WC() and
// WC2MB() either ask the API to report invalid or unrepresentable characters,
// when the system and code page allow it, or verify the result by converting
// it back and comparing it with the input.
2451 class wxMBConv_win32
: public wxMBConv
2456 m_CodePage
= CP_ACP
;
2457 m_minMBCharWidth
= 0;
2460 wxMBConv_win32(const wxMBConv_win32
& conv
)
2463 m_CodePage
= conv
.m_CodePage
;
2464 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2468 wxMBConv_win32(const char* name
)
2470 m_CodePage
= wxCharsetToCodepage(name
);
2471 m_minMBCharWidth
= 0;
2474 wxMBConv_win32(wxFontEncoding encoding
)
2476 m_CodePage
= wxEncodingToCodepage(encoding
);
2477 m_minMBCharWidth
= 0;
2479 #endif // wxUSE_FONTMAP
2481 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2483 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2484 // the behaviour is not compatible with the Unix version (using iconv)
2485 // and break the library itself, e.g. wxTextInputStream::NextChar()
2486 // wouldn't work if reading an incomplete MB char didn't result in an
2489 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2490 // Win XP or newer and it is not supported for UTF-[78] so we always
2491 // use our own conversions in this case. See
2492 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2493 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2494 if ( m_CodePage
== CP_UTF8
)
2496 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2499 if ( m_CodePage
== CP_UTF7
)
2501 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2505 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2506 IsAtLeastWin2kSP4() )
2508 flags
= MB_ERR_INVALID_CHARS
;
2511 const size_t len
= ::MultiByteToWideChar
2513 m_CodePage
, // code page
2514 flags
, // flags: fail on error
2515 psz
, // input string
2516 -1, // its length (NUL-terminated)
2517 buf
, // output string
2518 buf
? n
: 0 // size of output buffer
2522 // function totally failed
2523 return wxCONV_FAILED
;
2526 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2527 // check if we succeeded, by doing a double trip:
2528 if ( !flags
&& buf
)
2530 const size_t mbLen
= strlen(psz
);
2531 wxCharBuffer
mbBuf(mbLen
);
2532 if ( ::WideCharToMultiByte
2539 mbLen
+ 1, // size in bytes, not length
2543 strcmp(mbBuf
, psz
) != 0 )
2545 // we didn't obtain the same thing we started from, hence
2546 // the conversion was lossy and we consider that it failed
2547 return wxCONV_FAILED
;
2551 // note that it returns count of written chars for buf != NULL and size
2552 // of the needed buffer for buf == NULL so in either case the length of
2553 // the string (which never includes the terminating NUL) is one less
2557 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2560 we have a problem here: by default, WideCharToMultiByte() may
2561 replace characters unrepresentable in the target code page with bad
2562 quality approximations such as turning "1/2" symbol (U+00BD) into
2563 "1" for the code pages which don't have it and we, obviously, want
2564 to avoid this at any price
2566 the trouble is that this function does it _silently_, i.e. it won't
2567 even tell us whether it did or not... Win98/2000 and higher provide
2568 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2569 we have to resort to a round trip, i.e. check that converting back
2570 results in the same string -- this is, of course, expensive but
2571 otherwise we simply can't be sure to not garble the data.
2574 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2575 // it doesn't work with CJK encodings (which we test for rather roughly
2576 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2578 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2581 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2583 // it's our lucky day
2584 flags
= WC_NO_BEST_FIT_CHARS
;
2585 pUsedDef
= &usedDef
;
2587 else // old system or unsupported encoding
2593 const size_t len
= ::WideCharToMultiByte
2595 m_CodePage
, // code page
2596 flags
, // either none or no best fit
2597 pwz
, // input string
2598 -1, // it is (wide) NUL-terminated
2599 buf
, // output buffer
2600 buf
? n
: 0, // and its size
2601 NULL
, // default "replacement" char
2602 pUsedDef
// [out] was it used?
2607 // function totally failed
2608 return wxCONV_FAILED
;
2611 // we did something, check if we really succeeded
2614 // check if the conversion failed, i.e. if any replacements
2617 return wxCONV_FAILED
;
2619 else // we must resort to double tripping...
2621 // first we need to ensure that we really have the MB data: this is
2622 // not the case if we're called with NULL buffer, in which case we
2623 // need to do the conversion yet again
2624 wxCharBuffer bufDef
;
2627 bufDef
= wxCharBuffer(len
);
2628 buf
= bufDef
.data();
2629 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2630 buf
, len
, NULL
, NULL
) )
2631 return wxCONV_FAILED
;
2636 wxWCharBuffer
wcBuf(n
);
2637 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2638 wcscmp(wcBuf
, pwz
) != 0 )
2640 // we didn't obtain the same thing we started from, hence
2641 // the conversion was lossy and we consider that it failed
2642 return wxCONV_FAILED
;
2646 // see the comment above for the reason of "len - 1"
2650 virtual size_t GetMBNulLen() const
2652 if ( m_minMBCharWidth
== 0 )
2654 int len
= ::WideCharToMultiByte
2656 m_CodePage
, // code page
2658 L
"", // input string
2659 1, // translate just the NUL
2660 NULL
, // output buffer
2662 NULL
, // no replacement char
2663 NULL
// [out] don't care if it was used
2666 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2670 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2671 self
->m_minMBCharWidth
= (size_t)-1;
2675 self
->m_minMBCharWidth
= (size_t)-1;
2681 self
->m_minMBCharWidth
= len
;
2686 return m_minMBCharWidth
;
2689 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2691 bool IsOk() const { return m_CodePage
!= -1; }
2694 static bool CanUseNoBestFit()
2696 static int s_isWin98Or2k
= -1;
2698 if ( s_isWin98Or2k
== -1 )
2701 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2703 case wxOS_WINDOWS_9X
:
2704 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2707 case wxOS_WINDOWS_NT
:
2708 s_isWin98Or2k
= verMaj
>= 5;
2712 // unknown: be conservative by default
2717 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2720 return s_isWin98Or2k
== 1;
2723 static bool IsAtLeastWin2kSP4()
2728 static int s_isAtLeastWin2kSP4
= -1;
2730 if ( s_isAtLeastWin2kSP4
== -1 )
2732 OSVERSIONINFOEX ver
;
2734 memset(&ver
, 0, sizeof(ver
));
2735 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2736 GetVersionEx((OSVERSIONINFO
*)&ver
);
2738 s_isAtLeastWin2kSP4
=
2739 ((ver
.dwMajorVersion
> 5) || // Vista+
2740 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2741 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2742 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2746 return s_isAtLeastWin2kSP4
== 1;
2751 // the code page we're working with
2754 // cached result of GetMBNulLen(), set to 0 initially meaning
2756 size_t m_minMBCharWidth
;
2759 #endif // wxHAVE_WIN32_MB2WC
2762 // ============================================================================
2763 // wxEncodingConverter based conversion classes
2764 // ============================================================================
// wxMBConv implementation built on two wxEncodingConverter objects: m2w
// converts from the encoding m_enc to wxFONTENCODING_UNICODE and w2m converts
// back. m_ok records whether both converters could be initialized; encodings
// in the [wxFONTENCODING_MACMIN, wxFONTENCODING_MACMAX] range are always
// refused.
2768 class wxMBConv_wxwin
: public wxMBConv
2773 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2774 // The wxMBConv_cf class does a better job.
2775 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2776 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2777 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2781 // temporarily just use wxEncodingConverter stuff,
2782 // so that it works while a better implementation is built
2783 wxMBConv_wxwin(const char* name
)
2786 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2788 m_enc
= wxFONTENCODING_SYSTEM
;
2793 wxMBConv_wxwin(wxFontEncoding enc
)
// NB: the output buffer size n is ignored by both conversion functions below
2800 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2802 size_t inbuf
= strlen(psz
);
2805 if (!m2w
.Convert(psz
, buf
))
2806 return wxCONV_FAILED
;
2811 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2813 const size_t inbuf
= wxWcslen(psz
);
2816 if (!w2m
.Convert(psz
, buf
))
2817 return wxCONV_FAILED
;
2823 virtual size_t GetMBNulLen() const
2827 case wxFONTENCODING_UTF16BE
:
2828 case wxFONTENCODING_UTF16LE
:
2831 case wxFONTENCODING_UTF32BE
:
2832 case wxFONTENCODING_UTF32LE
:
2840 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2842 bool IsOk() const { return m_ok
; }
2845 wxFontEncoding m_enc
;
2846 wxEncodingConverter m2w
, w2m
;
2849 // were we initialized successfully?
2852 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2855 // make the constructors available for unit testing: creates a
// wxMBConv_wxwin for the given charset name and checks it with IsOk()
// (the handling of a failed check is not visible here)
2856 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2858 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2859 if ( !result
->IsOk() )
2868 #endif // wxUSE_FONTMAP
2870 // ============================================================================
2871 // wxCSConv implementation
2872 // ============================================================================
2874 void wxCSConv::Init()
// Construct a converter from a charset name.
// A non-empty name is stored with SetName() and mapped to an encoding via
// wxFontMapperBase::GetEncodingFromName(): an unknown name
// (wxFONTENCODING_MAX) becomes wxFONTENCODING_SYSTEM and
// wxFONTENCODING_DEFAULT becomes wxFONTENCODING_ISO8859_1. The remaining
// assignment of wxFONTENCODING_SYSTEM belongs to a branch whose condition is
// not visible here.
2881 wxCSConv::wxCSConv(const wxString
& charset
)
2885 if ( !charset
.empty() )
2887 SetName(charset
.ToAscii());
2891 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2892 if ( m_encoding
== wxFONTENCODING_MAX
)
2894 // set to unknown/invalid value
2895 m_encoding
= wxFONTENCODING_SYSTEM
;
2897 else if ( m_encoding
== wxFONTENCODING_DEFAULT
)
2899 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2900 m_encoding
= wxFONTENCODING_ISO8859_1
;
2903 m_encoding
= wxFONTENCODING_SYSTEM
;
// Construct a converter from an encoding value.
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not valid here: they
// trigger a wxFAIL_MSG() and are replaced with wxFONTENCODING_SYSTEM before
// the encoding is stored.
2907 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2909 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2911 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2913 encoding
= wxFONTENCODING_SYSTEM
;
2918 m_encoding
= encoding
;
2921 wxCSConv::~wxCSConv()
// Copy constructor: takes over the other object's charset name (through
// SetName()) and its encoding.
2926 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2931 SetName(conv
.m_name
);
2932 m_encoding
= conv
.m_encoding
;
// Assignment: takes over the other object's charset name (through SetName())
// and its encoding.
2935 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2939 SetName(conv
.m_name
);
2940 m_encoding
= conv
.m_encoding
;
2945 void wxCSConv::Clear()
// Store a private copy of the charset name, made with wxStrdup(), in m_name.
2954 void wxCSConv::SetName(const char *charset
)
2958 m_name
= wxStrdup(charset
);
// cache mapping a font encoding to a charset name, consulted by
// wxCSConv::DoCreate() before trying all names of an encoding with iconv; an
// empty name records that a previous attempt failed
2965 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2966 wxEncodingNameCache
);
2968 static wxEncodingNameCache gs_nameCache
;
// Create the converter object doing the real work for this charset/encoding.
//
// The candidates are tried in the order documented below: OS provided
// conversion (iconv, Win32 API, and the CoreFoundation based wxMBConv_cf
// under Darwin), then the built-in UTF converters and finally wxMBConv_wxwin.
// ISO8859-1 and wxFONTENCODING_DEFAULT are special cased as no conversion
// object is needed for them.
//
// Returns a new wxMBConv-derived object.
// NOTE(review): the return statements of the individual branches are not
// visible here; presumably NULL is returned for the "don't convert" case and
// after the error below has been logged -- confirm.
2971 wxMBConv
*wxCSConv::DoCreate() const
2974 wxLogTrace(TRACE_STRCONV
,
2975 wxT("creating conversion for %s"),
2977 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2978 #endif // wxUSE_FONTMAP
2980 // check for the special case of ASCII or ISO8859-1 charset: as we have
2981 // special knowledge of it anyhow, we don't need to create a special
2982 // conversion object
2983 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2984 m_encoding
== wxFONTENCODING_DEFAULT
)
2986 // don't convert at all
2990 // we trust OS to do conversion better than we can so try external
2991 // conversion methods first
2993 // the full order is:
2994 // 1. OS conversion (iconv() under Unix or Win32 API)
2995 // 2. hard coded conversions for UTF
2996 // 3. wxEncodingConverter as fall back
3002 #endif // !wxUSE_FONTMAP
3005 wxFontEncoding
encoding(m_encoding
);
// iconv attempt using the charset name
3010 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
3018 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3019 #endif // wxUSE_FONTMAP
// consult the cache of names already tried for this encoding: an empty
// cached name is checked for separately, a non-empty one is given to iconv
3023 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3024 if ( it
!= gs_nameCache
.end() )
3026 if ( it
->second
.empty() )
3029 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
// nothing usable in the cache: try all the names known for this encoding
3036 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3037 // CS : in case this does not return valid names (eg for MacRoman)
3038 // encoding got a 'failure' entry in the cache all the same,
3039 // although it just has to be created using a different method, so
3040 // only store failed iconv creation attempts (or perhaps we
3041 // shouldn't do this at all ?)
3042 if ( names
[0] != NULL
)
3044 for ( ; *names
; ++names
)
3046 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3047 // will need changes that will obsolete this
3048 wxString
name(*names
);
3049 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
3052 gs_nameCache
[encoding
] = *names
;
3059 gs_nameCache
[encoding
] = _T(""); // cache the failure
3062 #endif // wxUSE_FONTMAP
3064 #endif // HAVE_ICONV
3066 #ifdef wxHAVE_WIN32_MB2WC
// Win32 converter, created from the name if we have one and from the
// encoding otherwise
3069 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3070 : new wxMBConv_win32(m_encoding
);
3079 #endif // wxHAVE_WIN32_MB2WC
3083 // leave UTF16 and UTF32 to the built-ins of wx
3084 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3085 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3088 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3089 : new wxMBConv_cf(m_encoding
);
3091 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3100 #endif // __DARWIN__
// from here on work with a local copy of the encoding: if only the name is
// known (encoding is still wxFONTENCODING_SYSTEM), map the name to an encoding
3103 wxFontEncoding enc
= m_encoding
;
3105 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3107 // use "false" to suppress interactive dialogs -- we can be called from
3108 // anywhere and popping up a dialog from here is the last thing we want to
3110 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3112 #endif // wxUSE_FONTMAP
// the hard coded converters for the UTF encodings
3116 case wxFONTENCODING_UTF7
:
3117 return new wxMBConvUTF7
;
3119 case wxFONTENCODING_UTF8
:
3120 return new wxMBConvUTF8
;
3122 case wxFONTENCODING_UTF16BE
:
3123 return new wxMBConvUTF16BE
;
3125 case wxFONTENCODING_UTF16LE
:
3126 return new wxMBConvUTF16LE
;
3128 case wxFONTENCODING_UTF32BE
:
3129 return new wxMBConvUTF32BE
;
3131 case wxFONTENCODING_UTF32LE
:
3132 return new wxMBConvUTF32LE
;
3135 // nothing to do but put here to suppress gcc warnings
// NOTE(review): wxMBConv_wxwin is presumably the wxEncodingConverter based
// fall back from step 3 of the list above -- confirm
3142 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3143 : new wxMBConv_wxwin(m_encoding
);
3149 #endif // wxUSE_FONTMAP
3151 // NB: This is a hack to prevent deadlock. What could otherwise happen
3152 // in Unicode build: wxConvLocal creation ends up being here
3153 // because of some failure and logs the error. But wxLog will try to
3154 // attach a timestamp, for which it will need wxConvLocal (to convert
3155 // time to char* and then wchar_t*), but that fails, tries to log the
3156 // error, but wxLog has an (already locked) critical section that
3157 // guards the static buffer.
3158 static bool alreadyLoggingError
= false;
3159 if (!alreadyLoggingError
)
3161 alreadyLoggingError
= true;
3162 wxLogError(_("Cannot convert from the charset '%s'!"),
3166 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
3167 #else // !wxUSE_FONTMAP
3168 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
3169 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3172 alreadyLoggingError
= false;
// Set up the real converter: the result of DoCreate() is stored in
// m_convReal and m_deferred is reset to false. As this method is const, the
// members are modified through a non-const pointer to this object.
//
// When neither a charset name nor a specific encoding is known (the encoding
// is wxFONTENCODING_SYSTEM), the encoding is first replaced with either
// wxLocale::GetSystemEncoding() or the ISO8859-1 fallback.
// NOTE(review): the check making repeated calls cheap is not visible here;
// presumably everything is skipped unless m_deferred is set -- confirm.
3178 void wxCSConv::CreateConvIfNeeded() const
3182 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3184 // if we have neither the name nor the encoding, use the default
3185 // encoding for this system
3186 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3189 self
->m_encoding
= wxLocale::GetSystemEncoding();
3191 // fallback to some reasonable default:
3192 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3193 #endif // wxUSE_INTL
3196 self
->m_convReal
= DoCreate();
3197 self
->m_deferred
= false;
// Check whether this object can be used for conversions.
//
// Calls CreateConvIfNeeded() first, then returns true for ISO8859-1 (which
// is handled by this class itself without a real converter) and otherwise
// true if and only if m_convReal is non-NULL.
3201 bool wxCSConv::IsOk() const
3203 CreateConvIfNeeded();
3205 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3206 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3207 return true; // always ok as we do it ourselves
3209 // m_convReal->IsOk() is called at its own creation, so we know it must
3210 // be ok if m_convReal is non-NULL
3211 return m_convReal
!= NULL
;
// Convert the multibyte string src to wide characters.
//
// The work is forwarded to m_convReal->ToWChar(); the remaining code is the
// built-in conversion which copies every byte of src, interpreted as unsigned
// char, to the corresponding element of dst.
// NOTE(review): the condition selecting between the two paths is not
// visible here, presumably it is a check for m_convReal being non-NULL.
//
// dst, dstLen: output buffer and its size in wide characters
// src, srcLen: input and its length; srcLen == wxNO_LEN means that src is
//              NUL-terminated and the trailing NUL is converted too
//
// Returns wxCONV_FAILED if dstLen is less than srcLen.
// NOTE(review): the value returned on success is not visible here.
3214 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3215 const char *src
, size_t srcLen
) const
3217 CreateConvIfNeeded();
3220 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3223 if ( srcLen
== wxNO_LEN
)
3224 srcLen
= strlen(src
) + 1; // take trailing NUL too
3228 if ( dstLen
< srcLen
)
3229 return wxCONV_FAILED
;
3231 for ( size_t n
= 0; n
< srcLen
; n
++ )
3232 dst
[n
] = (unsigned char)(src
[n
]);
// Convert the wide character string src to multibyte representation.
//
// The work is forwarded to m_convReal->FromWChar(); the remaining code is the
// built-in conversion which narrows each wide character to a char and fails
// for any character greater than 0xFF.
// NOTE(review): the condition selecting between the two paths is not
// visible here, presumably it is a check for m_convReal being non-NULL.
//
// dst, dstLen: output buffer and its size in bytes
// src, srcLen: input and its length; srcLen == wxNO_LEN means that src is
//              NUL-terminated and the trailing NUL is counted too
//
// Returns wxCONV_FAILED if dstLen is less than srcLen or if a character does
// not fit into a byte; the input is validated even in the branch which
// doesn't write to dst.
// NOTE(review): the value returned on success is not visible here.
3238 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3239 const wchar_t *src
, size_t srcLen
) const
3241 CreateConvIfNeeded();
3244 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3247 if ( srcLen
== wxNO_LEN
)
3248 srcLen
= wxWcslen(src
) + 1;
3252 if ( dstLen
< srcLen
)
3253 return wxCONV_FAILED
;
3255 for ( size_t n
= 0; n
< srcLen
; n
++ )
3257 if ( src
[n
] > 0xFF )
3258 return wxCONV_FAILED
;
3260 dst
[n
] = (char)src
[n
];
3264 else // still need to check the input validity
3266 for ( size_t n
= 0; n
< srcLen
; n
++ )
3268 if ( src
[n
] > 0xFF )
3269 return wxCONV_FAILED
;
// Return the length of the terminating NUL in this charset: the question is
// forwarded to the real converter after making sure it was created.
// NOTE(review): the value returned for the built-in ISO-8859-1 case is not
// visible here, presumably it is 1 -- confirm.
3276 size_t wxCSConv::GetMBNulLen() const
3278 CreateConvIfNeeded();
3282 return m_convReal
->GetMBNulLen();
3285 // otherwise, we are ISO-8859-1
3289 #if wxUSE_UNICODE_UTF8
// Tell whether this converter converts to/from UTF-8: the question is
// forwarded to the real converter after making sure it was created.
// NOTE(review): the value returned for the built-in ISO-8859-1 case is not
// visible here, presumably it is false -- confirm.
3290 bool wxCSConv::IsUTF8() const
3292 CreateConvIfNeeded();
3296 return m_convReal
->IsUTF8();
3299 // otherwise, we are ISO-8859-1
// Convert the multibyte string s to a wide character buffer using several
// converters: wxConvLibc first, then UTF-8 and then ISO8859-1.
// NOTE(review): the conditions between the attempts are not visible here;
// presumably an empty buffer is returned for a NULL s and each next converter
// is used only if the previous one failed -- confirm.
3307 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3310 return wxWCharBuffer();
3312 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3314 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3316 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Convert the wide character string ws to a multibyte buffer using
// wxConvLibc first and then a UTF-8 converter created with the
// MAP_INVALID_UTF8_TO_OCTAL option.
// NOTE(review): the conditions between the attempts are not visible here;
// presumably an empty buffer is returned for a NULL ws and the UTF-8
// converter is used only if wxConvLibc failed -- confirm.
3321 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3324 return wxCharBuffer();
3326 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3328 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3333 #endif // wxUSE_UNICODE
3335 // ----------------------------------------------------------------------------
3337 // ----------------------------------------------------------------------------
3339 // NB: The reason why we create converter objects in this convoluted way,
3340 // using a factory function instead of global variable, is that they
3341 // may be used at static initialization time (some of them are used by
3342 // wxString ctors and there may be a global wxString object). In other
3343 // words, possibly _before_ the converter global object would be
3350 #undef wxConvISO8859_1
// Define the global converter called name:
//  - a name##Ptr pointer variable initialized with NULL
//  - the wxGet_##name##Ptr() accessor returning the address of a function
//    static object of class impl_klass constructed with ctor_args
//  - a file static pointer initialized by calling the accessor
// klass is the type of the returned pointer, impl_klass the type of the
// object really created.
3352 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3353 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3354 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3356 static impl_klass name##Obj ctor_args; \
3357 return &name##Obj; \
3359 /* this ensures that all global converter objects are created */ \
3360 /* by the time static initialization is done, i.e. before any */ \
3361 /* thread is launched: */ \
3362 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Same as WX_DEFINE_GLOBAL_CONV2 for the common case when the created object
// is of the same class as the one used for the pointer type.
3364 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3365 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// Definitions of the global converter objects.
//
// wxConvLibc has two alternative definitions, implemented either by
// wxMBConv_win32 or by wxMBConvLibc.
// NOTE(review): the preprocessor condition choosing between them is not
// visible here, presumably it is wxHAVE_WIN32_MB2WC -- confirm.
3368 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3370 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3373 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3374 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3375 // provokes an error message about "not enough macro parameters"; and we
3376 // can't use "()" here as the name##Obj declaration would be parsed as a
3377 // function declaration then, so use a semicolon and live with an extra
3378 // empty statement (and hope that no compiler warns about this)
3379 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3380 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
3382 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3383 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// wxConvCurrent initially points to wxConvLibc and wxConvUI to wxConvLocal
3385 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3386 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
// Converter used for the file names: a Darwin-specific object is defined
// first, wxConvFileName itself is wxConvLibc on the other platforms.
// NOTE(review): the value used in the __DARWIN__ branch is not visible here,
// presumably it is the address of wxConvMacUTF8DObj -- confirm.
3389 // The xnu kernel always communicates file paths in decomposed UTF-8.
3390 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3391 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
3394 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3397 #else // !__DARWIN__
3398 wxGet_wxConvLibcPtr();
3399 #endif // __DARWIN__/!__DARWIN__
3401 #else // !wxUSE_WCHAR_T
3403 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3404 // stand-ins in absence of wchar_t
3405 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3410 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T