1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
39 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
40 #include "wx/msw/private.h"
41 #include "wx/msw/missing.h"
42 #define wxHAVE_WIN32_MB2WC
47 #include "wx/thread.h"
50 #include "wx/encconv.h"
51 #include "wx/fontmap.h"
54 #include "wx/osx/core/private/strconv_cf.h"
55 #endif //def __DARWIN__
58 #define TRACE_STRCONV wxT("strconv")
60 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
62 #if SIZEOF_WCHAR_T == 2
67 // ============================================================================
69 // ============================================================================
// Helper function of cMB2WC(): checks whether the n bytes starting at p are
// all NUL.
//
// Returns true as soon as a non-NUL byte is found among these n bytes and
// false if all of them are NUL. For n == 0 nothing is read from p and false is
// returned (an empty range contains no non-NUL bytes).
static bool NotAllNULs(const char *p, size_t n)
{
    // skip over the NUL bytes, stopping at the first non-NUL one: n counts the
    // bytes which have not been verified to be NUL yet
    while ( n && *p++ == '\0' )
        n--;

    // if anything is left, we stopped before the end, i.e. at a non-NUL byte
    return n != 0;
}
80 // ----------------------------------------------------------------------------
81 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
82 // ----------------------------------------------------------------------------
84 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
89 *output
= (wxUint16
) input
;
93 else if (input
>= 0x110000)
101 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
102 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
109 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
111 if ((*input
< 0xd800) || (*input
> 0xdfff))
116 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
119 return wxCONV_FAILED
;
123 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
129 typedef wchar_t wxDecodeSurrogate_t
;
131 typedef wxUint16 wxDecodeSurrogate_t
;
132 #endif // WC_UTF16/!WC_UTF16
134 // returns the next UTF-32 character from the wchar_t buffer and advances the
135 // pointer to the character after this one
137 // if an invalid character is found, *pSrc is set to NULL, the caller must
139 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
143 n
= decode_utf16(reinterpret_cast<const wxUint16
*>(*pSrc
), out
);
144 if ( n
== wxCONV_FAILED
)
152 // ----------------------------------------------------------------------------
154 // ----------------------------------------------------------------------------
157 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
158 const char *src
, size_t srcLen
) const
160 // although new conversion classes are supposed to implement this function
161 // directly, the existing ones only implement the old MB2WC() and so, to
162 // avoid having to rewrite all conversion classes at once, we provide a
163 // default (but not efficient) implementation of this one in terms of the
164 // old function by copying the input to ensure that it's NUL-terminated and
165 // then using MB2WC() to convert it
167 // moreover, some conversion classes simply can't implement ToWChar()
168 // directly, the primary example is wxConvLibc: mbstowcs() only handles
169 // NUL-terminated strings
171 // the number of chars [which would be] written to dst [if it were not NULL]
172 size_t dstWritten
= 0;
174 // the number of NULs terminating this string
175 size_t nulLen
= 0; // not really needed, but just to avoid warnings
177 // if we were not given the input size we just have to assume that the
178 // string is properly terminated as we have no way of knowing how long it
179 // is anyhow, but if we do have the size check whether there are enough
183 if ( srcLen
!= wxNO_LEN
)
185 // we need to know how to find the end of this string
186 nulLen
= GetMBNulLen();
187 if ( nulLen
== wxCONV_FAILED
)
188 return wxCONV_FAILED
;
190 // if there are enough NULs we can avoid the copy
191 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
193 // make a copy in order to properly NUL-terminate the string
194 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
195 char * const p
= bufTmp
.data();
196 memcpy(p
, src
, srcLen
);
197 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
203 srcEnd
= src
+ srcLen
;
205 else // quit after the first loop iteration
210 // the idea of this code is straightforward: it converts a NUL-terminated
211 // chunk of the string during each iteration and updates the output buffer
214 // all the complications come from the fact that this function, for
215 // historical reasons, must behave in 2 subtly different ways when it's
216 // called with a fixed number of characters and when it's called for the
217 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
218 // must count all characters we convert, NUL or not; but in the latter we
219 // do not count the trailing NUL -- but still count all the NULs inside the
222 // so for the (simple) former case we just always count the trailing NUL,
223 // but for the latter we need to wait until we see if there is going to be
224 // another loop iteration and only count it then
227 // try to convert the current chunk
228 size_t lenChunk
= MB2WC(NULL
, src
, 0);
229 if ( lenChunk
== wxCONV_FAILED
)
230 return wxCONV_FAILED
;
232 dstWritten
+= lenChunk
;
238 // nothing left in the input string, conversion succeeded
244 if ( dstWritten
> dstLen
)
245 return wxCONV_FAILED
;
247 // +1 is for trailing NUL
248 if ( MB2WC(dst
, src
, lenChunk
+ 1) == wxCONV_FAILED
)
249 return wxCONV_FAILED
;
258 // we convert just one chunk in this case as this is the entire
259 // string anyhow (and we don't count the trailing NUL in this case)
263 // advance the input pointer past the end of this chunk: notice that we
264 // will always stop before srcEnd because we know that the chunk is
265 // always properly NUL-terminated
266 while ( NotAllNULs(src
, nulLen
) )
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
275 // if the buffer ends before this NUL, we shouldn't count it in our
276 // output so skip the code below
280 // do count this terminator as it's inside the buffer we convert
285 src
+= nulLen
; // skip the terminator itself
295 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
296 const wchar_t *src
, size_t srcLen
) const
298 // the number of chars [which would be] written to dst [if it were not NULL]
299 size_t dstWritten
= 0;
301 // if we don't know its length we have no choice but to assume that it is
302 // NUL-terminated (notice that it can still be NUL-terminated even if
303 // explicit length is given but it doesn't change our return value)
304 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
306 // make a copy of the input string unless it is already properly
308 wxWCharBuffer bufTmp
;
309 if ( isNulTerminated
)
311 srcLen
= wxWcslen(src
) + 1;
313 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
315 // make a copy in order to properly NUL-terminate the string
316 bufTmp
= wxWCharBuffer(srcLen
);
317 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
321 const size_t lenNul
= GetMBNulLen();
322 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
324 src
++ /* skip L'\0' too */ )
326 // try to convert the current chunk
327 size_t lenChunk
= WC2MB(NULL
, src
, 0);
328 if ( lenChunk
== wxCONV_FAILED
)
329 return wxCONV_FAILED
;
331 dstWritten
+= lenChunk
;
333 const wchar_t * const
334 chunkEnd
= isNulTerminated
? srcEnd
- 1 : src
+ wxWcslen(src
);
336 // our return value accounts for the trailing NUL(s), unlike that of
337 // WC2MB(), however don't do it for the last NUL we artificially added
339 if ( chunkEnd
< srcEnd
)
340 dstWritten
+= lenNul
;
344 if ( dstWritten
> dstLen
)
345 return wxCONV_FAILED
;
347 // if we know that there is enough space in the destination buffer
348 // (because we accounted for lenNul in dstWritten above), we can
349 // convert directly in place -- but otherwise we need another
350 // temporary buffer to ensure that we don't overwrite the output
353 if ( chunkEnd
== srcEnd
)
355 dstBuf
= wxCharBuffer(lenChunk
+ lenNul
- 1);
356 dstTmp
= dstBuf
.data();
363 if ( WC2MB(dstTmp
, src
, lenChunk
+ lenNul
) == wxCONV_FAILED
)
364 return wxCONV_FAILED
;
368 // copy everything up to but excluding the terminating NUL(s)
369 // into the real output buffer
370 memcpy(dst
, dstTmp
, lenChunk
);
372 // micro-optimization: if dstTmp != dst it means that chunkEnd
373 // == srcEnd and so we're done, no need to update anything below
378 if ( chunkEnd
< srcEnd
)
388 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
390 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
391 if ( rc
!= wxCONV_FAILED
)
393 // ToWChar() returns the buffer length, i.e. including the trailing
394 // NUL, while this method doesn't take it into account
401 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
403 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
404 if ( rc
!= wxCONV_FAILED
)
412 wxMBConv::~wxMBConv()
414 // nothing to do here (necessary for Darwin linking probably)
417 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
421 // calculate the length of the buffer needed first
422 const size_t nLen
= ToWChar(NULL
, 0, psz
);
423 if ( nLen
!= wxCONV_FAILED
)
425 // now do the actual conversion
426 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
428 // +1 for the trailing NULL
429 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
434 return wxWCharBuffer();
437 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
441 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
442 if ( nLen
!= wxCONV_FAILED
)
444 wxCharBuffer
buf(nLen
- 1);
445 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
450 return wxCharBuffer();
454 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
456 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
457 if ( dstLen
!= wxCONV_FAILED
)
459 // notice that we allocate space for dstLen+1 wide characters here
460 // because we want the buffer to always be NUL-terminated, even if the
461 // input isn't (as otherwise the caller has no way to know its length)
462 wxWCharBuffer
wbuf(dstLen
);
463 wbuf
.data()[dstLen
] = L
'\0';
464 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
470 // we also need to handle NUL-terminated input strings
471 // specially: for them the output is the length of the string
472 // excluding the trailing NUL, however if we're asked to
473 // convert a specific number of characters we return the length
474 // of the resulting output even if it's NUL-terminated
475 if ( inLen
== wxNO_LEN
)
486 return wxWCharBuffer();
490 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
492 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
493 if ( dstLen
!= wxCONV_FAILED
)
495 const size_t nulLen
= GetMBNulLen();
497 // as above, ensure that the buffer is always NUL-terminated, even if
499 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
500 memset(buf
.data() + dstLen
, 0, nulLen
);
501 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
507 if ( inLen
== wxNO_LEN
)
509 // in this case both input and output are NUL-terminated
510 // and we're not supposed to count NUL
522 return wxCharBuffer();
525 const wxWCharBuffer
wxMBConv::cMB2WC(const wxScopedCharBuffer
& buf
) const
527 const size_t srcLen
= buf
.length();
530 const size_t dstLen
= ToWChar(NULL
, 0, buf
, srcLen
);
531 if ( dstLen
!= wxCONV_FAILED
)
533 wxWCharBuffer
wbuf(dstLen
);
534 wbuf
.data()[dstLen
] = L
'\0';
535 if ( ToWChar(wbuf
.data(), dstLen
, buf
, srcLen
) != wxCONV_FAILED
)
540 return wxScopedWCharBuffer::CreateNonOwned(L
"", 0);
543 const wxCharBuffer
wxMBConv::cWC2MB(const wxScopedWCharBuffer
& wbuf
) const
545 const size_t srcLen
= wbuf
.length();
548 const size_t dstLen
= FromWChar(NULL
, 0, wbuf
, srcLen
);
549 if ( dstLen
!= wxCONV_FAILED
)
551 wxCharBuffer
buf(dstLen
);
552 buf
.data()[dstLen
] = '\0';
553 if ( FromWChar(buf
.data(), dstLen
, wbuf
, srcLen
) != wxCONV_FAILED
)
558 return wxScopedCharBuffer::CreateNonOwned("", 0);
561 // ----------------------------------------------------------------------------
563 // ----------------------------------------------------------------------------
565 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
567 return wxMB2WC(buf
, psz
, n
);
570 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
572 return wxWC2MB(buf
, psz
, n
);
575 // ----------------------------------------------------------------------------
576 // wxConvBrokenFileNames
577 // ----------------------------------------------------------------------------
581 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
583 if ( wxStricmp(charset
, wxT("UTF-8")) == 0 ||
584 wxStricmp(charset
, wxT("UTF8")) == 0 )
585 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
587 m_conv
= new wxCSConv(charset
);
592 // ----------------------------------------------------------------------------
594 // ----------------------------------------------------------------------------
596 // Implementation (C) 2004 Fredrik Roubert
598 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
601 // BASE64 decoding table
603 static const unsigned char utf7unb64
[] =
605 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
606 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
607 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
608 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
609 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
610 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
611 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
612 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
613 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
614 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
615 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
616 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
617 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
618 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
619 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
620 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
621 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
622 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
623 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
624 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
625 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
626 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
627 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
628 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
629 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
630 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
631 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
632 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
633 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
634 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
635 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
636 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
639 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
640 const char *src
, size_t srcLen
) const
642 DecoderState stateOrig
,
644 if ( srcLen
== wxNO_LEN
)
646 // convert the entire string, up to and including the trailing NUL
647 srcLen
= strlen(src
) + 1;
649 // when working on the entire strings we don't update nor use the shift
650 // state from the previous call
651 statePtr
= &stateOrig
;
653 else // when working with partial strings we do use the shift state
655 statePtr
= const_cast<DecoderState
*>(&m_stateDecoder
);
657 // also save the old state to be able to rollback to it on error
658 stateOrig
= m_stateDecoder
;
661 // but to simplify the code below we use this variable in both cases
662 DecoderState
& state
= *statePtr
;
665 // number of characters [which would have been] written to dst [if it were
669 const char * const srcEnd
= src
+ srcLen
;
671 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
673 const unsigned char cc
= *src
++;
675 if ( state
.IsShifted() )
677 const unsigned char dc
= utf7unb64
[cc
];
680 // end of encoded part, check that nothing was left: there can
681 // be up to 4 bits of 0 padding but nothing else (we also need
682 // to check isLSB as we count bits modulo 8 while a valid UTF-7
683 // encoded sequence must contain an integral number of UTF-16
685 if ( state
.isLSB
|| state
.bit
> 4 ||
686 (state
.accum
& ((1 << state
.bit
) - 1)) )
691 return wxCONV_FAILED
;
696 // re-parse this character normally below unless it's '-' which
697 // is consumed by the decoder
701 else // valid encoded character
703 // mini base64 decoder: each character is 6 bits
708 if ( state
.bit
>= 8 )
710 // got the full byte, consume it
712 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
716 // we've got the full word, output it
718 *dst
++ = (state
.msb
<< 8) | b
;
724 // just store it while we wait for LSB
732 if ( state
.IsDirect() )
734 // start of an encoded segment?
739 // just the encoded plus sign, don't switch to shifted mode
745 else if ( utf7unb64
[(unsigned)*src
] == 0xff )
747 // empty encoded chunks are not allowed
751 return wxCONV_FAILED
;
753 else // base-64 encoded chunk follows
760 // only printable 7 bit ASCII characters (with the exception of
761 // NUL, TAB, CR and LF) can be used directly
762 if ( cc
>= 0x7f || (cc
< ' ' &&
763 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
764 return wxCONV_FAILED
;
775 // as we didn't read any characters we should be called with the same
776 // data (followed by some more new data) again later so don't save our
780 return wxCONV_FAILED
;
787 // BASE64 encoding table
789 static const unsigned char utf7enb64
[] =
791 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
792 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
793 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
794 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
795 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
796 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
797 'w', 'x', 'y', 'z', '0', '1', '2', '3',
798 '4', '5', '6', '7', '8', '9', '+', '/'
// UTF-7 encoding table
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The table is indexed by the character code and so only covers the 7 bit
// characters.
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if the given character is marked as being in set D (value 0)
// in the utf7encode table above and false for all the other characters,
// including those outside of the table range.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms, so compare the value as
    // unsigned to ensure that negative (invalid) characters are rejected
    // instead of being used as a negative index into the table
    return static_cast<unsigned long>(wc) < 0x80 && utf7encode[wc] < 1;
}
826 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
827 const wchar_t *src
, size_t srcLen
) const
829 EncoderState stateOrig
,
831 if ( srcLen
== wxNO_LEN
)
833 // we don't apply the stored state when operating on entire strings at
835 statePtr
= &stateOrig
;
837 srcLen
= wxWcslen(src
) + 1;
839 else // do use the mode we left the output in previously
841 stateOrig
= m_stateEncoder
;
842 statePtr
= const_cast<EncoderState
*>(&m_stateEncoder
);
845 EncoderState
& state
= *statePtr
;
850 const wchar_t * const srcEnd
= src
+ srcLen
;
851 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
854 if ( wxIsUTF7Direct(cc
) )
856 if ( state
.IsShifted() )
858 // pad with zeros the last encoded block if necessary
862 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
877 else if ( cc
== '+' && state
.IsDirect() )
888 else if (((wxUint32
)cc
) > 0xffff)
890 // no surrogate pair generation (yet?)
891 return wxCONV_FAILED
;
896 if ( state
.IsDirect() )
905 // BASE64 encode string
908 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
911 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
913 for (state
.bit
+= 8; state
.bit
>= 6; )
917 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
922 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
930 // we need to restore the original encoder state if we were called just to
931 // calculate the amount of space needed as we will presumably be called
932 // again to really convert the data now
939 // ----------------------------------------------------------------------------
941 // ----------------------------------------------------------------------------
943 static const wxUint32 utf8_max
[]=
944 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
946 // boundaries of the private use area we use to (temporarily) remap
947 // characters that are invalid in a UTF-8 encoded string
948 const wxUint32 wxUnicodePUA
= 0x100000;
949 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
951 // this table gives the length of the UTF-8 encoding from its first character:
952 const unsigned char tableUtf8Lengths
[256] = {
953 // single-byte sequences (ASCII):
954 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
955 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
956 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
957 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
958 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
959 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
960 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
961 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
963 // these are invalid:
964 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
965 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
966 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
967 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
970 // two-byte sequences:
971 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
972 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
974 // three-byte sequences:
975 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
977 // four-byte sequences:
978 4, 4, 4, 4, 4, // F0..F4
980 // these are invalid again (5- or 6-byte
981 // sequences and sequences for code points
982 // above U+10FFFF, as restricted by RFC 3629):
983 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
987 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
988 const char *src
, size_t srcLen
) const
990 wchar_t *out
= dstLen
? dst
: NULL
;
993 if ( srcLen
== wxNO_LEN
)
994 srcLen
= strlen(src
) + 1;
996 for ( const char *p
= src
; ; p
++ )
998 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
1000 // all done successfully, just add the trailing NUL if we are not
1001 // using explicit length
1002 if ( srcLen
== wxNO_LEN
)
1018 if ( out
&& !dstLen
-- )
1022 unsigned char c
= *p
;
1026 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
1029 if ( srcLen
!= wxNO_LEN
)
1036 unsigned len
= tableUtf8Lengths
[c
];
1040 if ( srcLen
< len
) // the test works for wxNO_LEN too
1043 if ( srcLen
!= wxNO_LEN
)
1046 // Char. number range | UTF-8 octet sequence
1047 // (hexadecimal) | (binary)
1048 // ----------------------+----------------------------------------
1049 // 0000 0000 - 0000 007F | 0xxxxxxx
1050 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1051 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1052 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1054 // Code point value is stored in bits marked with 'x',
1055 // lowest-order bit of the value on the right side in the diagram
1056 // above. (from RFC 3629)
1058 // mask to extract lead byte's value ('x' bits above), by sequence
1060 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1062 // mask and value of lead byte's most significant bits, by length:
1063 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1064 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1066 len
--; // it's more convenient to work with 0-based length here
1068 // extract the lead byte's value bits:
1069 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
1072 code
= c
& leadValueMask
[len
];
1074 // all remaining bytes, if any, are handled in the same way
1075 // regardless of sequence's length:
1076 for ( ; len
; --len
)
1079 if ( (c
& 0xC0) != 0x80 )
1080 return wxCONV_FAILED
;
1088 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1089 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
1098 #endif // WC_UTF16/!WC_UTF16
1106 return wxCONV_FAILED
;
1110 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
1111 const wchar_t *src
, size_t srcLen
) const
1113 char *out
= dstLen
? dst
: NULL
;
1116 for ( const wchar_t *wp
= src
; ; wp
++ )
1118 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
) )
1120 // all done successfully, just add the trailing NUL if we are not
1121 // using explicit length
1122 if ( srcLen
== wxNO_LEN
)
1138 if ( srcLen
!= wxNO_LEN
)
1143 // cast is ok for WC_UTF16
1144 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1146 // skip the next char too as we decoded a surrogate
1149 #else // wchar_t is UTF-32
1150 code
= *wp
& 0x7fffffff;
1162 out
[0] = (char)code
;
1165 else if ( code
<= 0x07FF )
1173 // NB: this line takes 6 least significant bits, encodes them as
1174 // 10xxxxxx and discards them so that the next byte can be encoded:
1175 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1176 out
[0] = 0xC0 | code
;
1179 else if ( code
< 0xFFFF )
1187 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1188 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1189 out
[0] = 0xE0 | code
;
1192 else if ( code
<= 0x10FFFF )
1200 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1201 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1202 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1203 out
[0] = 0xF0 | code
;
1208 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1221 // we only get here if an error occurs during decoding
1222 return wxCONV_FAILED
;
1225 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1226 const char *psz
, size_t srcLen
) const
1228 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1229 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1233 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1235 const char *opsz
= psz
;
1236 bool invalid
= false;
1237 unsigned char cc
= *psz
++, fc
= cc
;
1239 for (cnt
= 0; fc
& 0x80; cnt
++)
1249 // escape the escape character for octal escapes
1250 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1251 && cc
== '\\' && (!buf
|| len
< n
))
1263 // invalid UTF-8 sequence
1268 unsigned ocnt
= cnt
- 1;
1269 wxUint32 res
= cc
& (0x3f >> cnt
);
1273 if ((cc
& 0xC0) != 0x80)
1275 // invalid UTF-8 sequence
1281 res
= (res
<< 6) | (cc
& 0x3f);
1284 if (invalid
|| res
<= utf8_max
[ocnt
])
1286 // illegal UTF-8 encoding
1289 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1290 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1292 // if one of our PUA characters turns up externally
1293 // it must also be treated as an illegal sequence
1294 // (a bit like you have to escape an escape character)
1300 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1301 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1302 if (pa
== wxCONV_FAILED
)
1314 *buf
++ = (wchar_t)res
;
1316 #endif // WC_UTF16/!WC_UTF16
1322 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1324 while (opsz
< psz
&& (!buf
|| len
< n
))
1327 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1328 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1329 wxASSERT(pa
!= wxCONV_FAILED
);
1336 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1342 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1344 while (opsz
< psz
&& (!buf
|| len
< n
))
1346 if ( buf
&& len
+ 3 < n
)
1348 unsigned char on
= *opsz
;
1350 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1351 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1352 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1359 else // MAP_INVALID_UTF8_NOT
1361 return wxCONV_FAILED
;
1367 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
// Returns true if wch is one of the octal digits, i.e. a character in the
// L'0'..L'7' range, and false for any other character.
static inline bool isoctal(wchar_t wch)
{
    return L'0' <= wch && wch <= L'7';
}
1378 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1379 const wchar_t *psz
, size_t srcLen
) const
1381 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1382 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1386 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1391 // cast is ok for WC_UTF16
1392 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1393 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1395 cc
= (*psz
++) & 0x7fffffff;
1398 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1399 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1402 *buf
++ = (char)(cc
- wxUnicodePUA
);
1405 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1406 && cc
== L
'\\' && psz
[0] == L
'\\' )
1413 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1415 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1419 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1420 (psz
[1] - L
'0') * 010 +
1430 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1446 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1448 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1454 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1460 // ============================================================================
1462 // ============================================================================
1464 #ifdef WORDS_BIGENDIAN
1465 #define wxMBConvUTF16straight wxMBConvUTF16BE
1466 #define wxMBConvUTF16swap wxMBConvUTF16LE
1468 #define wxMBConvUTF16swap wxMBConvUTF16BE
1469 #define wxMBConvUTF16straight wxMBConvUTF16LE
1473 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1475 if ( srcLen
== wxNO_LEN
)
1477 // count the number of bytes in input, including the trailing NULs
1478 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1479 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1482 srcLen
*= BYTES_PER_CHAR
;
1484 else // we already have the length
1486 // we can only convert an entire number of UTF-16 characters
1487 if ( srcLen
% BYTES_PER_CHAR
)
1488 return wxCONV_FAILED
;
1494 // case when in-memory representation is UTF-16 too
1497 // ----------------------------------------------------------------------------
1498 // conversions without endianness change
1499 // ----------------------------------------------------------------------------
1502 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1503 const char *src
, size_t srcLen
) const
1505 // set up the scene for using memcpy() (which is presumably more efficient
1506 // than copying the bytes one by one)
1507 srcLen
= GetLength(src
, srcLen
);
1508 if ( srcLen
== wxNO_LEN
)
1509 return wxCONV_FAILED
;
1511 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1514 if ( dstLen
< inLen
)
1515 return wxCONV_FAILED
;
1517 memcpy(dst
, src
, srcLen
);
1524 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1525 const wchar_t *src
, size_t srcLen
) const
1527 if ( srcLen
== wxNO_LEN
)
1528 srcLen
= wxWcslen(src
) + 1;
1530 srcLen
*= BYTES_PER_CHAR
;
1534 if ( dstLen
< srcLen
)
1535 return wxCONV_FAILED
;
1537 memcpy(dst
, src
, srcLen
);
1543 // ----------------------------------------------------------------------------
1544 // endian-reversing conversions
1545 // ----------------------------------------------------------------------------
1548 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1549 const char *src
, size_t srcLen
) const
1551 srcLen
= GetLength(src
, srcLen
);
1552 if ( srcLen
== wxNO_LEN
)
1553 return wxCONV_FAILED
;
1555 srcLen
/= BYTES_PER_CHAR
;
1559 if ( dstLen
< srcLen
)
1560 return wxCONV_FAILED
;
1562 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1563 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1565 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1573 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1574 const wchar_t *src
, size_t srcLen
) const
1576 if ( srcLen
== wxNO_LEN
)
1577 srcLen
= wxWcslen(src
) + 1;
1579 srcLen
*= BYTES_PER_CHAR
;
1583 if ( dstLen
< srcLen
)
1584 return wxCONV_FAILED
;
1586 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1587 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1589 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1596 #else // !WC_UTF16: wchar_t is UTF-32
1598 // ----------------------------------------------------------------------------
1599 // conversions without endianness change
1600 // ----------------------------------------------------------------------------
1603 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1604 const char *src
, size_t srcLen
) const
1606 srcLen
= GetLength(src
, srcLen
);
1607 if ( srcLen
== wxNO_LEN
)
1608 return wxCONV_FAILED
;
1610 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1613 // optimization: return maximal space which could be needed for this
1614 // string even if the real size could be smaller if the buffer contains
1620 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1621 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1623 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1625 return wxCONV_FAILED
;
1627 if ( ++outLen
> dstLen
)
1628 return wxCONV_FAILED
;
1638 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1639 const wchar_t *src
, size_t srcLen
) const
1641 if ( srcLen
== wxNO_LEN
)
1642 srcLen
= wxWcslen(src
) + 1;
1645 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1646 for ( size_t n
= 0; n
< srcLen
; n
++ )
1649 const size_t numChars
= encode_utf16(*src
++, cc
);
1650 if ( numChars
== wxCONV_FAILED
)
1651 return wxCONV_FAILED
;
1653 outLen
+= numChars
* BYTES_PER_CHAR
;
1656 if ( outLen
> dstLen
)
1657 return wxCONV_FAILED
;
1660 if ( numChars
== 2 )
1662 // second character of a surrogate
1671 // ----------------------------------------------------------------------------
1672 // endian-reversing conversions
1673 // ----------------------------------------------------------------------------
1676 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1677 const char *src
, size_t srcLen
) const
1679 srcLen
= GetLength(src
, srcLen
);
1680 if ( srcLen
== wxNO_LEN
)
1681 return wxCONV_FAILED
;
1683 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1686 // optimization: return maximal space which could be needed for this
1687 // string even if the real size could be smaller if the buffer contains
1693 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1694 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1699 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1701 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1703 const size_t numChars
= decode_utf16(tmp
, ch
);
1704 if ( numChars
== wxCONV_FAILED
)
1705 return wxCONV_FAILED
;
1707 if ( numChars
== 2 )
1710 if ( ++outLen
> dstLen
)
1711 return wxCONV_FAILED
;
1721 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1722 const wchar_t *src
, size_t srcLen
) const
1724 if ( srcLen
== wxNO_LEN
)
1725 srcLen
= wxWcslen(src
) + 1;
1728 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1729 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1732 const size_t numChars
= encode_utf16(*src
, cc
);
1733 if ( numChars
== wxCONV_FAILED
)
1734 return wxCONV_FAILED
;
1736 outLen
+= numChars
* BYTES_PER_CHAR
;
1739 if ( outLen
> dstLen
)
1740 return wxCONV_FAILED
;
1742 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1743 if ( numChars
== 2 )
1745 // second character of a surrogate
1746 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1754 #endif // WC_UTF16/!WC_UTF16
1757 // ============================================================================
1759 // ============================================================================
1761 #ifdef WORDS_BIGENDIAN
1762 #define wxMBConvUTF32straight wxMBConvUTF32BE
1763 #define wxMBConvUTF32swap wxMBConvUTF32LE
1765 #define wxMBConvUTF32swap wxMBConvUTF32BE
1766 #define wxMBConvUTF32straight wxMBConvUTF32LE
1770 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1771 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1774 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1776 if ( srcLen
== wxNO_LEN
)
1778 // count the number of bytes in input, including the trailing NULs
1779 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1780 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1783 srcLen
*= BYTES_PER_CHAR
;
1785 else // we already have the length
1787 // we can only convert an entire number of UTF-32 characters
1788 if ( srcLen
% BYTES_PER_CHAR
)
1789 return wxCONV_FAILED
;
1795 // case when in-memory representation is UTF-16
1798 // ----------------------------------------------------------------------------
1799 // conversions without endianness change
1800 // ----------------------------------------------------------------------------
1803 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1804 const char *src
, size_t srcLen
) const
1806 srcLen
= GetLength(src
, srcLen
);
1807 if ( srcLen
== wxNO_LEN
)
1808 return wxCONV_FAILED
;
1810 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1811 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1813 for ( size_t n
= 0; n
< inLen
; n
++ )
1816 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1817 if ( numChars
== wxCONV_FAILED
)
1818 return wxCONV_FAILED
;
1823 if ( outLen
> dstLen
)
1824 return wxCONV_FAILED
;
1827 if ( numChars
== 2 )
1829 // second character of a surrogate
1839 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1840 const wchar_t *src
, size_t srcLen
) const
1842 if ( srcLen
== wxNO_LEN
)
1843 srcLen
= wxWcslen(src
) + 1;
1847 // optimization: return maximal space which could be needed for this
1848 // string instead of the exact amount which could be less if there are
1849 // any surrogates in the input
1851 // we consider that surrogates are rare enough to make it worthwhile to
1852 // avoid running the loop below at the cost of slightly extra memory
1854 return srcLen
* BYTES_PER_CHAR
;
1857 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
1859 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1861 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1863 return wxCONV_FAILED
;
1865 outLen
+= BYTES_PER_CHAR
;
1867 if ( outLen
> dstLen
)
1868 return wxCONV_FAILED
;
1876 // ----------------------------------------------------------------------------
1877 // endian-reversing conversions
1878 // ----------------------------------------------------------------------------
1881 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1882 const char *src
, size_t srcLen
) const
1884 srcLen
= GetLength(src
, srcLen
);
1885 if ( srcLen
== wxNO_LEN
)
1886 return wxCONV_FAILED
;
1888 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1889 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1891 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1894 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1895 if ( numChars
== wxCONV_FAILED
)
1896 return wxCONV_FAILED
;
1901 if ( outLen
> dstLen
)
1902 return wxCONV_FAILED
;
1905 if ( numChars
== 2 )
1907 // second character of a surrogate
1917 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1918 const wchar_t *src
, size_t srcLen
) const
1920 if ( srcLen
== wxNO_LEN
)
1921 srcLen
= wxWcslen(src
) + 1;
1925 // optimization: return maximal space which could be needed for this
1926 // string instead of the exact amount which could be less if there are
1927 // any surrogates in the input
1929 // we consider that surrogates are rare enough to make it worthwhile to
1930 // avoid running the loop below at the cost of slightly extra memory
1932 return srcLen
*BYTES_PER_CHAR
;
1935 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
1937 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1939 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1941 return wxCONV_FAILED
;
1943 outLen
+= BYTES_PER_CHAR
;
1945 if ( outLen
> dstLen
)
1946 return wxCONV_FAILED
;
1948 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1954 #else // !WC_UTF16: wchar_t is UTF-32
1956 // ----------------------------------------------------------------------------
1957 // conversions without endianness change
1958 // ----------------------------------------------------------------------------
1961 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1962 const char *src
, size_t srcLen
) const
1964 // use memcpy() as it should be much faster than hand-written loop
1965 srcLen
= GetLength(src
, srcLen
);
1966 if ( srcLen
== wxNO_LEN
)
1967 return wxCONV_FAILED
;
1969 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1972 if ( dstLen
< inLen
)
1973 return wxCONV_FAILED
;
1975 memcpy(dst
, src
, srcLen
);
1982 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1983 const wchar_t *src
, size_t srcLen
) const
1985 if ( srcLen
== wxNO_LEN
)
1986 srcLen
= wxWcslen(src
) + 1;
1988 srcLen
*= BYTES_PER_CHAR
;
1992 if ( dstLen
< srcLen
)
1993 return wxCONV_FAILED
;
1995 memcpy(dst
, src
, srcLen
);
2001 // ----------------------------------------------------------------------------
2002 // endian-reversing conversions
2003 // ----------------------------------------------------------------------------
2006 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
2007 const char *src
, size_t srcLen
) const
2009 srcLen
= GetLength(src
, srcLen
);
2010 if ( srcLen
== wxNO_LEN
)
2011 return wxCONV_FAILED
;
2013 srcLen
/= BYTES_PER_CHAR
;
2017 if ( dstLen
< srcLen
)
2018 return wxCONV_FAILED
;
2020 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
2021 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
2023 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
2031 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
2032 const wchar_t *src
, size_t srcLen
) const
2034 if ( srcLen
== wxNO_LEN
)
2035 srcLen
= wxWcslen(src
) + 1;
2037 srcLen
*= BYTES_PER_CHAR
;
2041 if ( dstLen
< srcLen
)
2042 return wxCONV_FAILED
;
2044 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
2045 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
2047 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
2054 #endif // WC_UTF16/!WC_UTF16
2057 // ============================================================================
2058 // The classes doing conversion using the iconv_xxx() functions
2059 // ============================================================================
2063 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2064 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2065 // (unless there's yet another bug in glibc) the only case when iconv()
2066 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2067 // left in the input buffer -- when _real_ error occurs,
2068 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2070 // [This bug does not appear in glibc 2.2.]
2071 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2072 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2073 (errno != E2BIG || bufLeft != 0))
2075 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2078 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2080 #define ICONV_T_INVALID ((iconv_t)-1)
2082 #if SIZEOF_WCHAR_T == 4
2083 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2084 #define WC_ENC wxFONTENCODING_UTF32
2085 #elif SIZEOF_WCHAR_T == 2
2086 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2087 #define WC_ENC wxFONTENCODING_UTF16
2088 #else // sizeof(wchar_t) != 2 nor 4
2089 // does this ever happen?
2090 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2093 // ----------------------------------------------------------------------------
2094 // wxMBConv_iconv: encapsulates an iconv character set
2095 // ----------------------------------------------------------------------------
2097 class wxMBConv_iconv
: public wxMBConv
2100 wxMBConv_iconv(const char *name
);
2101 virtual ~wxMBConv_iconv();
2103 // implement base class virtual methods
2104 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
2105 const char *src
, size_t srcLen
= wxNO_LEN
) const;
2106 virtual size_t FromWChar(char *dst
, size_t dstLen
,
2107 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
2108 virtual size_t GetMBNulLen() const;
2110 #if wxUSE_UNICODE_UTF8
2111 virtual bool IsUTF8() const;
2114 virtual wxMBConv
*Clone() const
2116 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
);
2117 p
->m_minMBCharWidth
= m_minMBCharWidth
;
2122 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
2125 // the iconv handlers used to translate from multibyte
2126 // to wide char and in the other direction
2131 // guards access to m2w and w2m objects
2132 wxMutex m_iconvMutex
;
2136 // the name (for iconv_open()) of a wide char charset -- if none is
2137 // available on this machine, it will remain NULL
2138 static wxString ms_wcCharsetName
;
2140 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2141 // different endian-ness than the native one
2142 static bool ms_wcNeedsSwap
;
2145 // name of the encoding handled by this conversion
2148 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2150 size_t m_minMBCharWidth
;
2153 // make the constructor available for unit testing
2154 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2156 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2157 if ( !result
->IsOk() )
2166 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2167 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
2169 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
2170 : m_name(wxStrdup(name
))
2172 m_minMBCharWidth
= 0;
2174 // check for charset that represents wchar_t:
2175 if ( ms_wcCharsetName
.empty() )
2177 wxLogTrace(TRACE_STRCONV
, wxT("Looking for wide char codeset:"));
2180 const wxChar
*const *names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2181 #else // !wxUSE_FONTMAP
2182 static const wxChar
*const names_static
[] =
2184 #if SIZEOF_WCHAR_T == 4
2186 #elif SIZEOF_WCHAR_T = 2
2191 const wxChar
*const *names
= names_static
;
2192 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2194 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2196 const wxString
nameCS(*names
);
2198 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2199 wxString
nameXE(nameCS
);
2201 #ifdef WORDS_BIGENDIAN
2202 nameXE
+= wxT("BE");
2203 #else // little endian
2204 nameXE
+= wxT("LE");
2207 wxLogTrace(TRACE_STRCONV
, wxT(" trying charset \"%s\""),
2210 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2211 if ( m2w
== ICONV_T_INVALID
)
2213 // try charset w/o bytesex info (e.g. "UCS4")
2214 wxLogTrace(TRACE_STRCONV
, wxT(" trying charset \"%s\""),
2216 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2218 // and check for bytesex ourselves:
2219 if ( m2w
!= ICONV_T_INVALID
)
2221 char buf
[2], *bufPtr
;
2230 outsz
= SIZEOF_WCHAR_T
* 2;
2231 char* wbufPtr
= (char*)wbuf
;
2235 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2238 if (ICONV_FAILED(res
, insz
))
2240 wxLogLastError(wxT("iconv"));
2241 wxLogError(_("Conversion to charset '%s' doesn't work."),
2244 else // ok, can convert to this encoding, remember it
2246 ms_wcCharsetName
= nameCS
;
2247 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2251 else // use charset not requiring byte swapping
2253 ms_wcCharsetName
= nameXE
;
2257 wxLogTrace(TRACE_STRCONV
,
2258 wxT("iconv wchar_t charset is \"%s\"%s"),
2259 ms_wcCharsetName
.empty() ? wxString("<none>")
2261 ms_wcNeedsSwap
? wxT(" (needs swap)")
2264 else // we already have ms_wcCharsetName
2266 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2269 if ( ms_wcCharsetName
.empty() )
2271 w2m
= ICONV_T_INVALID
;
2275 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2276 if ( w2m
== ICONV_T_INVALID
)
2278 wxLogTrace(TRACE_STRCONV
,
2279 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2280 ms_wcCharsetName
.c_str(), name
);
2285 wxMBConv_iconv::~wxMBConv_iconv()
2287 free(const_cast<char *>(m_name
));
2289 if ( m2w
!= ICONV_T_INVALID
)
2291 if ( w2m
!= ICONV_T_INVALID
)
2296 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2297 const char *src
, size_t srcLen
) const
2299 if ( srcLen
== wxNO_LEN
)
2301 // find the string length: notice that must be done differently for
2302 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2304 const size_t nulLen
= GetMBNulLen();
2308 return wxCONV_FAILED
;
2311 srcLen
= strlen(src
); // arguably more optimized than our version
2316 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2317 // but they also have to start at character boundary and not
2318 // span two adjacent characters
2320 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2326 // when we're determining the length of the string ourselves we count
2327 // the terminating NUL(s) as part of it and always NUL-terminate the
2332 // we express length in the number of (wide) characters but iconv always
2333 // counts buffer sizes it in bytes
2334 dstLen
*= SIZEOF_WCHAR_T
;
2337 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2338 // Unfortunately there are a couple of global wxCSConv objects such as
2339 // wxConvLocal that are used all over wx code, so we have to make sure
2340 // the handle is used by at most one thread at the time. Otherwise
2341 // only a few wx classes would be safe to use from non-main threads
2342 // as MB<->WC conversion would fail "randomly".
2343 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2344 #endif // wxUSE_THREADS
2347 const char *pszPtr
= src
;
2351 char* bufPtr
= (char*)dst
;
2353 // have destination buffer, convert there
2354 size_t dstLenOrig
= dstLen
;
2356 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2359 // convert the number of bytes converted as returned by iconv to the
2360 // number of (wide) characters converted that we need
2361 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2365 // convert to native endianness
2366 for ( unsigned i
= 0; i
< res
; i
++ )
2367 dst
[i
] = WC_BSWAP(dst
[i
]);
2370 else // no destination buffer
2372 // convert using temp buffer to calculate the size of the buffer needed
2378 char* bufPtr
= (char*)tbuf
;
2379 dstLen
= 8 * SIZEOF_WCHAR_T
;
2382 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2385 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
2387 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2390 if (ICONV_FAILED(cres
, srcLen
))
2392 //VS: it is ok if iconv fails, hence trace only
2393 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2394 return wxCONV_FAILED
;
2400 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2401 const wchar_t *src
, size_t srcLen
) const
2404 // NB: explained in MB2WC
2405 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2408 if ( srcLen
== wxNO_LEN
)
2409 srcLen
= wxWcslen(src
) + 1;
2411 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2412 size_t outbuflen
= dstLen
;
2415 wchar_t *tmpbuf
= 0;
2419 // need to copy to temp buffer to switch endianness
2420 // (doing WC_BSWAP twice on the original buffer won't work, as it
2421 // could be in read-only memory, or be accessed in some other thread)
2422 tmpbuf
= (wchar_t *)malloc(inbuflen
);
2423 for ( size_t i
= 0; i
< srcLen
; i
++ )
2424 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2429 char* inbuf
= (char*)src
;
2432 // have destination buffer, convert there
2433 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2435 res
= dstLen
- outbuflen
;
2437 else // no destination buffer
2439 // convert using temp buffer to calculate the size of the buffer needed
2445 outbuflen
= WXSIZEOF(tbuf
);
2447 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2449 res
+= WXSIZEOF(tbuf
) - outbuflen
;
2451 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2459 if (ICONV_FAILED(cres
, inbuflen
))
2461 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2462 return wxCONV_FAILED
;
2468 size_t wxMBConv_iconv::GetMBNulLen() const
2470 if ( m_minMBCharWidth
== 0 )
2472 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2475 // NB: explained in MB2WC
2476 wxMutexLocker
lock(self
->m_iconvMutex
);
2479 const wchar_t *wnul
= L
"";
2480 char buf
[8]; // should be enough for NUL in any encoding
2481 size_t inLen
= sizeof(wchar_t),
2482 outLen
= WXSIZEOF(buf
);
2483 char *inBuff
= (char *)wnul
;
2484 char *outBuff
= buf
;
2485 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2487 self
->m_minMBCharWidth
= (size_t)-1;
2491 self
->m_minMBCharWidth
= outBuff
- buf
;
2495 return m_minMBCharWidth
;
2498 #if wxUSE_UNICODE_UTF8
2499 bool wxMBConv_iconv::IsUTF8() const
2501 return wxStricmp(m_name
, "UTF-8") == 0 ||
2502 wxStricmp(m_name
, "UTF8") == 0;
2506 #endif // HAVE_ICONV
2509 // ============================================================================
2510 // Win32 conversion classes
2511 // ============================================================================
2513 #ifdef wxHAVE_WIN32_MB2WC
2517 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2518 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2521 class wxMBConv_win32
: public wxMBConv
2526 m_CodePage
= CP_ACP
;
2527 m_minMBCharWidth
= 0;
2530 wxMBConv_win32(const wxMBConv_win32
& conv
)
2533 m_CodePage
= conv
.m_CodePage
;
2534 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2538 wxMBConv_win32(const char* name
)
2540 m_CodePage
= wxCharsetToCodepage(name
);
2541 m_minMBCharWidth
= 0;
2544 wxMBConv_win32(wxFontEncoding encoding
)
2546 m_CodePage
= wxEncodingToCodepage(encoding
);
2547 m_minMBCharWidth
= 0;
2549 #endif // wxUSE_FONTMAP
2551 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2553 // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2554 // the behaviour is not compatible with the Unix version (using iconv)
2555 // and break the library itself, e.g. wxTextInputStream::NextChar()
2556 // wouldn't work if reading an incomplete MB char didn't result in an
2559 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2560 // Win XP or newer and it is not supported for UTF-[78] so we always
2561 // use our own conversions in this case. See
2562 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2563 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2564 if ( m_CodePage
== CP_UTF8
)
2566 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2569 if ( m_CodePage
== CP_UTF7
)
2571 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2575 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2576 IsAtLeastWin2kSP4() )
2578 flags
= MB_ERR_INVALID_CHARS
;
2581 const size_t len
= ::MultiByteToWideChar
2583 m_CodePage
, // code page
2584 flags
, // flags: fall on error
2585 psz
, // input string
2586 -1, // its length (NUL-terminated)
2587 buf
, // output string
2588 buf
? n
: 0 // size of output buffer
2592 // function totally failed
2593 return wxCONV_FAILED
;
2596 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2597 // check if we succeeded, by doing a double trip:
2598 if ( !flags
&& buf
)
2600 const size_t mbLen
= strlen(psz
);
2601 wxCharBuffer
mbBuf(mbLen
);
2602 if ( ::WideCharToMultiByte
2609 mbLen
+ 1, // size in bytes, not length
2613 strcmp(mbBuf
, psz
) != 0 )
2615 // we didn't obtain the same thing we started from, hence
2616 // the conversion was lossy and we consider that it failed
2617 return wxCONV_FAILED
;
2621 // note that it returns count of written chars for buf != NULL and size
2622 // of the needed buffer for buf == NULL so in either case the length of
2623 // the string (which never includes the terminating NUL) is one less
2627 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2630 we have a problem here: by default, WideCharToMultiByte() may
2631 replace characters unrepresentable in the target code page with bad
2632 quality approximations such as turning "1/2" symbol (U+00BD) into
2633 "1" for the code pages which don't have it and we, obviously, want
2634 to avoid this at any price
2636 the trouble is that this function does it _silently_, i.e. it won't
2637 even tell us whether it did or not... Win98/2000 and higher provide
2638 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2639 we have to resort to a round trip, i.e. check that converting back
2640 results in the same string -- this is, of course, expensive but
2641 otherwise we simply can't be sure to not garble the data.
2644 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2645 // it doesn't work with CJK encodings (which we test for rather roughly
2646 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2648 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2651 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2653 // it's our lucky day
2654 flags
= WC_NO_BEST_FIT_CHARS
;
2655 pUsedDef
= &usedDef
;
2657 else // old system or unsupported encoding
2663 const size_t len
= ::WideCharToMultiByte
2665 m_CodePage
, // code page
2666 flags
, // either none or no best fit
2667 pwz
, // input string
2668 -1, // it is (wide) NUL-terminated
2669 buf
, // output buffer
2670 buf
? n
: 0, // and its size
2671 NULL
, // default "replacement" char
2672 pUsedDef
// [out] was it used?
2677 // function totally failed
2678 return wxCONV_FAILED
;
2681 // we did something, check if we really succeeded
2684 // check if the conversion failed, i.e. if any replacements
2687 return wxCONV_FAILED
;
2689 else // we must resort to double tripping...
2691 // first we need to ensure that we really have the MB data: this is
2692 // not the case if we're called with NULL buffer, in which case we
2693 // need to do the conversion yet again
2694 wxCharBuffer bufDef
;
2697 bufDef
= wxCharBuffer(len
);
2698 buf
= bufDef
.data();
2699 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2700 buf
, len
, NULL
, NULL
) )
2701 return wxCONV_FAILED
;
2706 wxWCharBuffer
wcBuf(n
);
2707 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2708 wcscmp(wcBuf
, pwz
) != 0 )
2710 // we didn't obtain the same thing we started from, hence
2711 // the conversion was lossy and we consider that it failed
2712 return wxCONV_FAILED
;
2716 // see the comment above for the reason of "len - 1"
// Returns the cached value of m_minMBCharWidth, computing it on first use by
// asking WideCharToMultiByte() to translate a lone wide NUL in m_CodePage.
2720 virtual size_t GetMBNulLen() const
// a cached value of 0 means that the width has not been determined yet
2722 if ( m_minMBCharWidth
== 0 )
2724 int len
= ::WideCharToMultiByte
2726 m_CodePage
, // code page
2728 L
"", // input string
2729 1, // translate just the NUL
2730 NULL
, // output buffer
2732 NULL
, // no replacement char
2733 NULL
// [out] don't care if it was used
// this method is const, so a non-const alias of "this" is used to update
// the cached member
2736 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
// an unexpected length is logged and the cache is set to (size_t)-1
2740 wxLogDebug(wxT("Unexpected NUL length %d"), len
);
2741 self
->m_minMBCharWidth
= (size_t)-1;
2745 self
->m_minMBCharWidth
= (size_t)-1;
// otherwise the length reported by the API is cached as is
2751 self
->m_minMBCharWidth
= len
;
2756 return m_minMBCharWidth
;
2759 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2761 bool IsOk() const { return m_CodePage
!= -1; }
// Returns true if the cached OS version check evaluated to 1. The check is
// performed only while the function-local static still holds -1 and its
// result is remembered for subsequent calls.
2764 static bool CanUseNoBestFit()
// -1 means "not determined yet"
2766 static int s_isWin98Or2k
= -1;
2768 if ( s_isWin98Or2k
== -1 )
// the OS family returned by wxGetOsVersion() selects which version test
// is applied to the major/minor numbers it fills in
2771 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2773 case wxOS_WINDOWS_9X
:
// 9x family: requires major >= 4 and minor >= 10
2774 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2777 case wxOS_WINDOWS_NT
:
// NT family: requires major >= 5
2778 s_isWin98Or2k
= verMaj
>= 5;
2782 // unknown: be conservative by default
// sanity check that one of the branches above assigned a value
2787 wxASSERT_MSG( s_isWin98Or2k
!= -1, wxT("should be set above") );
2790 return s_isWin98Or2k
== 1;
// Returns true if the cached check of the running Windows version evaluated
// to 1. The version is queried with GetVersionEx() only while the
// function-local static still holds -1.
2793 static bool IsAtLeastWin2kSP4()
// -1 means "not determined yet"
2798 static int s_isAtLeastWin2kSP4
= -1;
2800 if ( s_isAtLeastWin2kSP4
== -1 )
2802 OSVERSIONINFOEX ver
;
// zero the structure and record its size before passing it to the API
2804 memset(&ver
, 0, sizeof(ver
));
2805 ver
.dwOSVersionInfoSize
= sizeof(ver
);
// the extended structure is passed through a cast to the basic type
2806 GetVersionEx((OSVERSIONINFO
*)&ver
);
// accepted: major > 5, or 5.x with x > 0, or 5.0 with service pack >= 4
2808 s_isAtLeastWin2kSP4
=
2809 ((ver
.dwMajorVersion
> 5) || // Vista+
2810 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2811 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2812 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2816 return s_isAtLeastWin2kSP4
== 1;
2821 // the code page we're working with
2824 // cached result of GetMBNulLen(), set to 0 initially meaning
2826 size_t m_minMBCharWidth
;
2829 #endif // wxHAVE_WIN32_MB2WC
2832 // ============================================================================
2833 // wxEncodingConverter based conversion classes
2834 // ============================================================================
// wxMBConv implementation built on two wxEncodingConverter members: m2w
// (initialized from m_enc to wxFONTENCODING_UNICODE) and w2m (the reverse
// direction). m_ok records whether both were set up successfully.
2838 class wxMBConv_wxwin
: public wxMBConv
2843 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2844 // The wxMBConv_cf class does a better job.
// m_ok is true only if m_enc lies outside [MACMIN, MACMAX] and both
// converters initialize successfully
2845 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2846 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2847 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2851 // temporarily just use wxEncodingConverter stuff,
2852 // so that it works while a better implementation is built
// construct from a charset name: the name is mapped to an encoding using
// the font mapper, with "false" passed as the second argument
2853 wxMBConv_wxwin(const char* name
)
2856 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2858 m_enc
= wxFONTENCODING_SYSTEM
;
// construct directly from an encoding value
2863 wxMBConv_wxwin(wxFontEncoding enc
)
// Multibyte to wide conversion using m2w; the size argument is unused.
// Returns wxCONV_FAILED if the converter reports failure.
2870 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
// length of the input in bytes, not counting the terminating NUL
2872 size_t inbuf
= strlen(psz
);
2875 if (!m2w
.Convert(psz
, buf
))
2876 return wxCONV_FAILED
;
// Wide to multibyte conversion using w2m; the size argument is unused.
// Returns wxCONV_FAILED if the converter reports failure.
2881 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
// length of the input in wide characters, not counting the terminating NUL
2883 const size_t inbuf
= wxWcslen(psz
);
2886 if (!w2m
.Convert(psz
, buf
))
2887 return wxCONV_FAILED
;
// Size of the terminating NUL; the UTF-16 and UTF-32 encodings are
// singled out as special cases below.
2893 virtual size_t GetMBNulLen() const
2897 case wxFONTENCODING_UTF16BE
:
2898 case wxFONTENCODING_UTF16LE
:
2901 case wxFONTENCODING_UTF32BE
:
2902 case wxFONTENCODING_UTF32LE
:
// the clone is constructed anew from the encoding rather than copied (the
// class is declared non-copyable below)
2910 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2912 bool IsOk() const { return m_ok
; }
// encoding handled by this converter
2915 wxFontEncoding m_enc
;
// multibyte-to-wide and wide-to-multibyte converters
2916 wxEncodingConverter m2w
, w2m
;
2919 // were we initialized successfully?
2922 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin
);
2925 // make the constructors available for unit testing
// Factory function: allocates a wxMBConv_wxwin for the given charset name
// and then checks whether it was initialized successfully using IsOk().
2926 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2928 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
// NOTE(review): presumably a converter failing this check is destroyed
// and a null pointer returned instead -- confirm against the full source
2929 if ( !result
->IsOk() )
2938 #endif // wxUSE_FONTMAP
2940 // ============================================================================
2941 // wxCSConv implementation
2942 // ============================================================================
2944 void wxCSConv::Init()
// Stores the encoding to use in m_encoding, replacing the special values
// wxFONTENCODING_MAX, wxFONTENCODING_SYSTEM and wxFONTENCODING_DEFAULT with
// a concrete choice as described by the comments of each case.
2950 void wxCSConv::SetEncoding(wxFontEncoding encoding
)
2954 case wxFONTENCODING_MAX
:
2955 case wxFONTENCODING_SYSTEM
:
2958 // It's ok to not have encoding value if we have a name for it.
2959 m_encoding
= wxFONTENCODING_SYSTEM
;
2961 else // No name either.
2963 // Fall back to the system default encoding in this case (not
2964 // sure how much sense does this make but this is how the old
2965 // code used to behave).
2967 m_encoding
= wxLocale::GetSystemEncoding();
// if even the locale doesn't know the system encoding, settle for
// ISO-8859-1
2968 if ( m_encoding
== wxFONTENCODING_SYSTEM
)
2969 #endif // wxUSE_INTL
2970 m_encoding
= wxFONTENCODING_ISO8859_1
;
2974 case wxFONTENCODING_DEFAULT
:
2975 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2976 m_encoding
= wxFONTENCODING_ISO8859_1
;
2980 // Just use the provided encoding.
2981 m_encoding
= encoding
;
// Constructs a converter from a charset name: a non-empty name is stored
// using SetName(), the encoding is set either from the name or to
// wxFONTENCODING_SYSTEM, and the real converter is then created.
2985 wxCSConv::wxCSConv(const wxString
& charset
)
2989 if ( !charset
.empty() )
// the name is stored in its ASCII form
2991 SetName(charset
.ToAscii());
// NOTE(review): the two SetEncoding() calls below are presumably
// alternatives selected at compile time -- confirm against the full source
2995 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset
));
2997 SetEncoding(wxFONTENCODING_SYSTEM
);
3000 m_convReal
= DoCreate();
// Constructs a converter for the given encoding. wxFONTENCODING_MAX and
// wxFONTENCODING_DEFAULT are rejected with a debug failure message and
// replaced by wxFONTENCODING_SYSTEM before the encoding is applied.
3003 wxCSConv::wxCSConv(wxFontEncoding encoding
)
3005 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
3007 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
// continue with a usable value instead of the invalid one
3009 encoding
= wxFONTENCODING_SYSTEM
;
3014 SetEncoding(encoding
);
3016 m_convReal
= DoCreate();
3019 wxCSConv::~wxCSConv()
// Copy constructor: takes over the name and the encoding of the other
// object and creates a new real converter for them, i.e. m_convReal is not
// shared with the source object.
3024 wxCSConv::wxCSConv(const wxCSConv
& conv
)
3029 SetName(conv
.m_name
);
3030 SetEncoding(conv
.m_encoding
);
3032 m_convReal
= DoCreate();
3035 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
3039 SetName(conv
.m_name
);
3040 SetEncoding(conv
.m_encoding
);
3042 m_convReal
= DoCreate();
// Releases the resources owned by this object.
3047 void wxCSConv::Clear()
// destroy the real converter using the wxDELETE() macro
3052 wxDELETE(m_convReal
);
// Stores a private copy of the given charset name in m_name.
3055 void wxCSConv::SetName(const char *charset
)
// the string is duplicated with wxStrdup(), so the caller keeps ownership
// of its own buffer
3058 m_name
= wxStrdup(charset
);
// Hash map type associating a wxFontEncoding key with a wxString value.
3063 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
3064 wxEncodingNameCache
);
// File-scope cache used by wxCSConv::DoCreate(): it maps an encoding to the
// charset name which worked for it, or to an empty string to record failure.
3066 static wxEncodingNameCache gs_nameCache
;
// Creates the real converter object for the current m_name/m_encoding pair.
// The candidates are tried in the order spelled out in the comment below:
// OS-provided conversions first, then the hard coded UTF converters and
// finally the wxEncodingConverter-based class.
3069 wxMBConv
*wxCSConv::DoCreate() const
3072 wxLogTrace(TRACE_STRCONV
,
3073 wxT("creating conversion for %s"),
3075 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
3076 #endif // wxUSE_FONTMAP
3078 // check for the special case of ASCII or ISO8859-1 charset: as we have
3079 // special knowledge of it anyhow, we don't need to create a special
3080 // conversion object
3081 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3083 // don't convert at all
3087 // we trust OS to do conversion better than we can so try external
3088 // conversion methods first
3090 // the full order is:
3091 // 1. OS conversion (iconv() under Unix or Win32 API)
3092 // 2. hard coded conversions for UTF
3093 // 3. wxEncodingConverter as fall back
3099 #endif // !wxUSE_FONTMAP
3102 wxFontEncoding
encoding(m_encoding
);
// iconv attempt using the charset name as given
3107 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
3115 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3116 #endif // wxUSE_FONTMAP
// consult the cache of previous iconv attempts for this encoding
3120 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3121 if ( it
!= gs_nameCache
.end() )
// an empty cached name is the marker stored below for a failed attempt
3123 if ( it
->second
.empty() )
3126 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
// nothing cached yet: try all the names known for this encoding in turn
3133 const wxChar
* const* names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3134 // CS : in case this does not return valid names (eg for MacRoman)
3135 // encoding got a 'failure' entry in the cache all the same,
3136 // although it just has to be created using a different method, so
3137 // only store failed iconv creation attempts (or perhaps we
3138 // shouldn't do this at all ?)
3139 if ( names
[0] != NULL
)
3141 for ( ; *names
; ++names
)
3143 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3144 // will need changes that will obsolete this
3145 wxString
name(*names
);
3146 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
// remember this name for the encoding
3149 gs_nameCache
[encoding
] = *names
;
3156 gs_nameCache
[encoding
] = wxT(""); // cache the failure
3159 #endif // wxUSE_FONTMAP
3161 #endif // HAVE_ICONV
3163 #ifdef wxHAVE_WIN32_MB2WC
// Win32 API based converter, created from the name if there is one and
// from the encoding otherwise
3166 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3167 : new wxMBConv_win32(m_encoding
);
3176 #endif // wxHAVE_WIN32_MB2WC
3180 // leave UTF16 and UTF32 to the built-ins of wx
3181 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3182 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3185 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3186 : new wxMBConv_cf(m_encoding
);
3188 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3197 #endif // __DARWIN__
// from here on work with a local copy of the encoding which may be
// refined using the charset name
3200 wxFontEncoding enc
= m_encoding
;
3202 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3204 // use "false" to suppress interactive dialogs -- we can be called from
3205 // anywhere and popping up a dialog from here is the last thing we want to
3207 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3209 #endif // wxUSE_FONTMAP
// the UTF encodings are handled by the dedicated built-in converters
3213 case wxFONTENCODING_UTF7
:
3214 return new wxMBConvUTF7
;
3216 case wxFONTENCODING_UTF8
:
3217 return new wxMBConvUTF8
;
3219 case wxFONTENCODING_UTF16BE
:
3220 return new wxMBConvUTF16BE
;
3222 case wxFONTENCODING_UTF16LE
:
3223 return new wxMBConvUTF16LE
;
3225 case wxFONTENCODING_UTF32BE
:
3226 return new wxMBConvUTF32BE
;
3228 case wxFONTENCODING_UTF32LE
:
3229 return new wxMBConvUTF32LE
;
3232 // nothing to do but put here to suppress gcc warnings
// last candidate: the wxEncodingConverter-based class
3239 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3240 : new wxMBConv_wxwin(m_encoding
);
3247 wxLogTrace(TRACE_STRCONV
,
3248 wxT("encoding \"%s\" is not supported by this system"),
3249 (m_name
? wxString(m_name
)
3250 : wxFontMapperBase::GetEncodingName(m_encoding
)));
3251 #endif // wxUSE_FONTMAP
3256 bool wxCSConv::IsOk() const
3258 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3259 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3260 return true; // always ok as we do it ourselves
3262 // m_convReal->IsOk() is called at its own creation, so we know it must
3263 // be ok if m_convReal is non-NULL
3264 return m_convReal
!= NULL
;
// Converts a multibyte string to wide characters, either by forwarding to
// the real converter or by widening each byte directly.
3267 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3268 const char *src
, size_t srcLen
) const
// delegate to the real converter
3271 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
// direct conversion: wxNO_LEN means that the input is NUL-terminated, so
// compute its length
3274 if ( srcLen
== wxNO_LEN
)
3275 srcLen
= strlen(src
) + 1; // take trailing NUL too
// the output needs one element per input byte
3279 if ( dstLen
< srcLen
)
3280 return wxCONV_FAILED
;
// go through unsigned char so that bytes >= 0x80 are not sign-extended
3282 for ( size_t n
= 0; n
< srcLen
; n
++ )
3283 dst
[n
] = (unsigned char)(src
[n
]);
// Converts a wide character string to multibyte, either by forwarding to
// the real converter or by narrowing each character directly. The direct
// conversion fails for any character above 0xFF.
3289 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3290 const wchar_t *src
, size_t srcLen
) const
// delegate to the real converter
3293 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
// direct conversion: wxNO_LEN means that the input is NUL-terminated, so
// compute its length including the terminating NUL
3296 if ( srcLen
== wxNO_LEN
)
3297 srcLen
= wxWcslen(src
) + 1;
// the output needs one byte per input character
3301 if ( dstLen
< srcLen
)
3302 return wxCONV_FAILED
;
3304 for ( size_t n
= 0; n
< srcLen
; n
++ )
// characters which don't fit into a single byte can't be represented
3306 if ( src
[n
] > 0xFF )
3307 return wxCONV_FAILED
;
3309 dst
[n
] = (char)src
[n
];
3313 else // still need to check the input validity
3315 for ( size_t n
= 0; n
< srcLen
; n
++ )
3317 if ( src
[n
] > 0xFF )
3318 return wxCONV_FAILED
;
// Returns the size of the terminating NUL for this conversion.
3325 size_t wxCSConv::GetMBNulLen() const
// forward the question to the real converter
3328 return m_convReal
->GetMBNulLen();
3330 // otherwise, we are ISO-8859-1
3334 #if wxUSE_UNICODE_UTF8
// Tells whether this conversion is UTF-8 (only compiled in when
// wxUSE_UNICODE_UTF8 is set).
3335 bool wxCSConv::IsUTF8() const
// forward the question to the real converter
3338 return m_convReal
->IsUTF8();
3340 // otherwise, we are ISO-8859-1
// Converts a multibyte string to a wide character buffer trying several
// converters in turn: wxConvLibc, then UTF-8 and finally ISO-8859-1.
3348 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
// early exit with an empty buffer
3351 return wxWCharBuffer();
// first attempt: wxConvLibc
3353 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
// NOTE(review): presumably each assignment below is only done if the
// previous attempt failed -- confirm against the full source
3355 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3357 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Converts a wide character string to a multibyte buffer, trying wxConvLibc
// first and then a UTF-8 converter created with the
// MAP_INVALID_UTF8_TO_OCTAL option.
3362 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
// early exit with an empty buffer
3365 return wxCharBuffer();
// first attempt: wxConvLibc
3367 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
// NOTE(review): presumably only done if the first attempt failed --
// confirm against the full source
3369 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3374 #endif // wxUSE_UNICODE
3376 // ----------------------------------------------------------------------------
3378 // ----------------------------------------------------------------------------
3380 // NB: The reason why we create converter objects in this convoluted way,
3381 // using a factory function instead of global variable, is that they
3382 // may be used at static initialization time (some of them are used by
3383 // wxString ctors and there may be a global wxString object). In other
3384 // words, possibly _before_ the converter global object would be
3391 #undef wxConvISO8859_1
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines, for the
// given name:
//  - a name##Ptr variable of type klass*, initially NULL
//  - an accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass constructed with
//    ctor_args
//  - a file-scope static initialized by calling this accessor
3393 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3394 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3395 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3397 static impl_klass name##Obj ctor_args; \
3398 return &name##Obj; \
3400 /* this ensures that all global converter objects are created */ \
3401 /* by the time static initialization is done, i.e. before any */ \
3402 /* thread is launched: */ \
3403 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for the common case when the same class is used both as the
// interface and as the implementation.
3405 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3406 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3409 // disable warning "variable 'xxx' was declared but never referenced"
3410 #pragma warning(disable: 177)
// Definition of the global wxConvLibc converter: it is always exposed as a
// wxMBConv but its implementation class depends on the preprocessor branch
// taken (wxMBConv_win32, wxMBConv_cf in the disabled "#elif 0" branch, or
// wxMBConvLibc).
3414 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3415 #elif 0 // defined(__WXOSX__)
3416 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_cf
, wxConvLibc
, (wxFONTENCODING_UTF8
));
3418 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3421 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3422 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3423 // provokes an error message about "not enough macro parameters"; and we
3424 // can't use "()" here as the name##Obj declaration would be parsed as a
3425 // function declaration then, so use a semicolon and live with an extra
3426 // empty statement (and hope that no compiler warns about this)
// UTF-8 and UTF-7 converters; the lone semicolon is passed as ctor_args so
// that the objects are default-constructed
3427 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3428 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
// wxCSConv-based converters for the system encoding and for ISO-8859-1
3430 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3431 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// global pointers initially referring to the libc and local converters
// defined above
3433 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3434 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3437 // It is important to use this conversion object under Darwin as it ensures
3438 // that Unicode strings are (re)composed correctly even though xnu kernel uses
3439 // decomposed form internally (at least for the file names).
3440 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
// Converter used for file names: outside of Darwin this is simply the libc
// converter.
3443 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3446 #else // !__DARWIN__
3447 wxGet_wxConvLibcPtr();
3448 #endif // __DARWIN__/!__DARWIN__