1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
8 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
9 // (c) 2000-2003 Vadim Zeitlin
10 // (c) 2004 Ryan Norton, Fredrik Roubert
11 // Licence: wxWindows licence
12 /////////////////////////////////////////////////////////////////////////////
14 // For compilers that support precompilation, includes "wx.h".
15 #include "wx/wxprec.h"
25 #include "wx/hashmap.h"
28 #include "wx/strconv.h"
38 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
39 #include "wx/msw/private.h"
40 #include "wx/msw/missing.h"
41 #define wxHAVE_WIN32_MB2WC
46 #include "wx/thread.h"
49 #include "wx/encconv.h"
50 #include "wx/fontmap.h"
53 #include "wx/osx/core/private/strconv_cf.h"
54 #endif //def __DARWIN__
57 #define TRACE_STRCONV wxT("strconv")
59 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
61 #if SIZEOF_WCHAR_T == 2
66 // ============================================================================
68 // ============================================================================
// helper function of cMB2WC(): returns true if the n bytes starting at p are
// NOT all NUL, i.e. if at least one of them is non-zero
//
// p: start of the bytes to examine (must point to at least n readable bytes)
// n: number of bytes to examine; for n == 0 the result is false
static bool NotAllNULs(const char *p, size_t n)
{
    // skip over the NUL bytes, n counts the bytes which remain to be examined
    while ( n && *p++ == '\0' )
        n--;

    // if anything is left, we stopped at a byte which was not NUL
    return n != 0;
}
79 // ----------------------------------------------------------------------------
80 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
81 // ----------------------------------------------------------------------------
83 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
88 *output
= (wxUint16
) input
;
92 else if (input
>= 0x110000)
100 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
101 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
108 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
110 if ((*input
< 0xd800) || (*input
> 0xdfff))
115 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
118 return wxCONV_FAILED
;
122 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
128 typedef wchar_t wxDecodeSurrogate_t
;
130 typedef wxUint16 wxDecodeSurrogate_t
;
131 #endif // WC_UTF16/!WC_UTF16
133 // returns the next UTF-32 character from the wchar_t buffer and advances the
134 // pointer to the character after this one
136 // if an invalid character is found, *pSrc is set to NULL, the caller must
138 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
142 n
= decode_utf16(reinterpret_cast<const wxUint16
*>(*pSrc
), out
);
143 if ( n
== wxCONV_FAILED
)
151 // ----------------------------------------------------------------------------
153 // ----------------------------------------------------------------------------
156 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
157 const char *src
, size_t srcLen
) const
159 // although new conversion classes are supposed to implement this function
160 // directly, the existing ones only implement the old MB2WC() and so, to
161 // avoid having to rewrite all conversion classes at once, we provide a
162 // default (but not efficient) implementation of this one in terms of the
163 // old function by copying the input to ensure that it's NUL-terminated and
164 // then using MB2WC() to convert it
166 // moreover, some conversion classes simply can't implement ToWChar()
167 // directly, the primary example is wxConvLibc: mbstowcs() only handles
168 // NUL-terminated strings
170 // the number of chars [which would be] written to dst [if it were not NULL]
171 size_t dstWritten
= 0;
173 // the number of NULs terminating this string
174 size_t nulLen
= 0; // not really needed, but just to avoid warnings
176 // if we were not given the input size we just have to assume that the
177 // string is properly terminated as we have no way of knowing how long it
178 // is anyhow, but if we do have the size check whether there are enough
182 if ( srcLen
!= wxNO_LEN
)
184 // we need to know how to find the end of this string
185 nulLen
= GetMBNulLen();
186 if ( nulLen
== wxCONV_FAILED
)
187 return wxCONV_FAILED
;
189 // if there are enough NULs we can avoid the copy
190 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
192 // make a copy in order to properly NUL-terminate the string
193 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
194 char * const p
= bufTmp
.data();
195 memcpy(p
, src
, srcLen
);
196 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
202 srcEnd
= src
+ srcLen
;
204 else // quit after the first loop iteration
209 // the idea of this code is straightforward: it converts a NUL-terminated
210 // chunk of the string during each iteration and updates the output buffer
213 // all the complications come from the fact that this function, for
214 // historical reasons, must behave in 2 subtly different ways when it's
215 // called with a fixed number of characters and when it's called for the
216 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
217 // must count all characters we convert, NUL or not; but in the latter we
218 // do not count the trailing NUL -- but still count all the NULs inside the
221 // so for the (simple) former case we just always count the trailing NUL,
222 // but for the latter we need to wait until we see if there is going to be
223 // another loop iteration and only count it then
226 // try to convert the current chunk
227 size_t lenChunk
= MB2WC(NULL
, src
, 0);
228 if ( lenChunk
== wxCONV_FAILED
)
229 return wxCONV_FAILED
;
231 dstWritten
+= lenChunk
;
237 // nothing left in the input string, conversion succeeded
243 if ( dstWritten
> dstLen
)
244 return wxCONV_FAILED
;
246 // +1 is for trailing NUL
247 if ( MB2WC(dst
, src
, lenChunk
+ 1) == wxCONV_FAILED
)
248 return wxCONV_FAILED
;
257 // we convert just one chunk in this case as this is the entire
258 // string anyhow (and we don't count the trailing NUL in this case)
262 // advance the input pointer past the end of this chunk: notice that we
263 // will always stop before srcEnd because we know that the chunk is
264 // always properly NUL-terminated
265 while ( NotAllNULs(src
, nulLen
) )
267 // notice that we must skip over multiple bytes here as we suppose
268 // that if NUL takes 2 or 4 bytes, then all the other characters do
269 // too and so if advanced by a single byte we might erroneously
270 // detect sequences of NUL bytes in the middle of the input
274 // if the buffer ends before this NUL, we shouldn't count it in our
275 // output so skip the code below
279 // do count this terminator as it's inside the buffer we convert
284 src
+= nulLen
; // skip the terminator itself
294 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
295 const wchar_t *src
, size_t srcLen
) const
297 // the number of chars [which would be] written to dst [if it were not NULL]
298 size_t dstWritten
= 0;
300 // if we don't know its length we have no choice but to assume that it is
301 // NUL-terminated (notice that it can still be NUL-terminated even if
302 // explicit length is given but it doesn't change our return value)
303 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
305 // make a copy of the input string unless it is already properly
307 wxWCharBuffer bufTmp
;
308 if ( isNulTerminated
)
310 srcLen
= wxWcslen(src
) + 1;
312 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
314 // make a copy in order to properly NUL-terminate the string
315 bufTmp
= wxWCharBuffer(srcLen
);
316 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
320 const size_t lenNul
= GetMBNulLen();
321 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
323 src
++ /* skip L'\0' too */ )
325 // try to convert the current chunk
326 size_t lenChunk
= WC2MB(NULL
, src
, 0);
327 if ( lenChunk
== wxCONV_FAILED
)
328 return wxCONV_FAILED
;
330 dstWritten
+= lenChunk
;
332 const wchar_t * const
333 chunkEnd
= isNulTerminated
? srcEnd
- 1 : src
+ wxWcslen(src
);
335 // our return value accounts for the trailing NUL(s), unlike that of
336 // WC2MB(), however don't do it for the last NUL we artificially added
338 if ( chunkEnd
< srcEnd
)
339 dstWritten
+= lenNul
;
343 if ( dstWritten
> dstLen
)
344 return wxCONV_FAILED
;
346 // if we know that there is enough space in the destination buffer
347 // (because we accounted for lenNul in dstWritten above), we can
348 // convert directly in place -- but otherwise we need another
349 // temporary buffer to ensure that we don't overwrite the output
352 if ( chunkEnd
== srcEnd
)
354 dstBuf
= wxCharBuffer(lenChunk
+ lenNul
- 1);
355 dstTmp
= dstBuf
.data();
362 if ( WC2MB(dstTmp
, src
, lenChunk
+ lenNul
) == wxCONV_FAILED
)
363 return wxCONV_FAILED
;
367 // copy everything up to but excluding the terminating NUL(s)
368 // into the real output buffer
369 memcpy(dst
, dstTmp
, lenChunk
);
371 // micro-optimization: if dstTmp != dst it means that chunkEnd
372 // == srcEnd and so we're done, no need to update anything below
377 if ( chunkEnd
< srcEnd
)
387 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
389 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
390 if ( rc
!= wxCONV_FAILED
)
392 // ToWChar() returns the buffer length, i.e. including the trailing
393 // NUL, while this method doesn't take it into account
400 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
402 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
403 if ( rc
!= wxCONV_FAILED
)
411 wxMBConv::~wxMBConv()
413 // nothing to do here (necessary for Darwin linking probably)
416 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
420 // calculate the length of the buffer needed first
421 const size_t nLen
= ToWChar(NULL
, 0, psz
);
422 if ( nLen
!= wxCONV_FAILED
)
424 // now do the actual conversion
425 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
427 // +1 for the trailing NUL
428 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
433 return wxWCharBuffer();
436 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
440 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
441 if ( nLen
!= wxCONV_FAILED
)
443 wxCharBuffer
buf(nLen
- 1);
444 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
449 return wxCharBuffer();
453 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
455 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
456 if ( dstLen
!= wxCONV_FAILED
)
458 // notice that we allocate space for dstLen+1 wide characters here
459 // because we want the buffer to always be NUL-terminated, even if the
460 // input isn't (as otherwise the caller has no way to know its length)
461 wxWCharBuffer
wbuf(dstLen
);
462 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
468 // we also need to handle NUL-terminated input strings
469 // specially: for them the output is the length of the string
470 // excluding the trailing NUL, however if we're asked to
471 // convert a specific number of characters we return the length
472 // of the resulting output even if it's NUL-terminated
473 if ( inLen
== wxNO_LEN
)
484 return wxWCharBuffer();
488 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
490 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
491 if ( dstLen
!= wxCONV_FAILED
)
493 const size_t nulLen
= GetMBNulLen();
495 // as above, ensure that the buffer is always NUL-terminated, even if
497 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
498 memset(buf
.data() + dstLen
, 0, nulLen
);
499 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
505 if ( inLen
== wxNO_LEN
)
507 // in this case both input and output are NUL-terminated
508 // and we're not supposed to count NUL
520 return wxCharBuffer();
523 const wxWCharBuffer
wxMBConv::cMB2WC(const wxScopedCharBuffer
& buf
) const
525 const size_t srcLen
= buf
.length();
528 const size_t dstLen
= ToWChar(NULL
, 0, buf
, srcLen
);
529 if ( dstLen
!= wxCONV_FAILED
)
531 wxWCharBuffer
wbuf(dstLen
);
532 wbuf
.data()[dstLen
] = L
'\0';
533 if ( ToWChar(wbuf
.data(), dstLen
, buf
, srcLen
) != wxCONV_FAILED
)
538 return wxScopedWCharBuffer::CreateNonOwned(L
"", 0);
541 const wxCharBuffer
wxMBConv::cWC2MB(const wxScopedWCharBuffer
& wbuf
) const
543 const size_t srcLen
= wbuf
.length();
546 const size_t dstLen
= FromWChar(NULL
, 0, wbuf
, srcLen
);
547 if ( dstLen
!= wxCONV_FAILED
)
549 wxCharBuffer
buf(dstLen
);
550 buf
.data()[dstLen
] = '\0';
551 if ( FromWChar(buf
.data(), dstLen
, wbuf
, srcLen
) != wxCONV_FAILED
)
556 return wxScopedCharBuffer::CreateNonOwned("", 0);
559 // ----------------------------------------------------------------------------
561 // ----------------------------------------------------------------------------
563 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
565 return wxMB2WC(buf
, psz
, n
);
568 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
570 return wxWC2MB(buf
, psz
, n
);
573 // ----------------------------------------------------------------------------
574 // wxConvBrokenFileNames
575 // ----------------------------------------------------------------------------
579 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
581 if ( wxStricmp(charset
, wxT("UTF-8")) == 0 ||
582 wxStricmp(charset
, wxT("UTF8")) == 0 )
583 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
585 m_conv
= new wxCSConv(charset
);
590 // ----------------------------------------------------------------------------
592 // ----------------------------------------------------------------------------
594 // Implementation (C) 2004 Fredrik Roubert
596 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
599 // BASE64 decoding table
// Maps a byte value to the 6 bit value it represents in (modified) BASE64, or
// to 0xff if the byte is not a BASE64 character ('A'-'Z', 'a'-'z', '0'-'9',
// '+' and '/'). The table has an entry for each of the 256 byte values.
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
637 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
638 const char *src
, size_t srcLen
) const
640 DecoderState stateOrig
,
642 if ( srcLen
== wxNO_LEN
)
644 // convert the entire string, up to and including the trailing NUL
645 srcLen
= strlen(src
) + 1;
647 // when working on the entire strings we don't update nor use the shift
648 // state from the previous call
649 statePtr
= &stateOrig
;
651 else // when working with partial strings we do use the shift state
653 statePtr
= const_cast<DecoderState
*>(&m_stateDecoder
);
655 // also save the old state to be able to rollback to it on error
656 stateOrig
= m_stateDecoder
;
659 // but to simplify the code below we use this variable in both cases
660 DecoderState
& state
= *statePtr
;
663 // number of characters [which would have been] written to dst [if it were
667 const char * const srcEnd
= src
+ srcLen
;
669 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
671 const unsigned char cc
= *src
++;
673 if ( state
.IsShifted() )
675 const unsigned char dc
= utf7unb64
[cc
];
678 // end of encoded part, check that nothing was left: there can
679 // be up to 4 bits of 0 padding but nothing else (we also need
680 // to check isLSB as we count bits modulo 8 while a valid UTF-7
681 // encoded sequence must contain an integral number of UTF-16
683 if ( state
.isLSB
|| state
.bit
> 4 ||
684 (state
.accum
& ((1 << state
.bit
) - 1)) )
689 return wxCONV_FAILED
;
694 // re-parse this character normally below unless it's '-' which
695 // is consumed by the decoder
699 else // valid encoded character
701 // mini base64 decoder: each character is 6 bits
706 if ( state
.bit
>= 8 )
708 // got the full byte, consume it
710 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
714 // we've got the full word, output it
716 *dst
++ = (state
.msb
<< 8) | b
;
722 // just store it while we wait for LSB
730 if ( state
.IsDirect() )
732 // start of an encoded segment?
737 // just the encoded plus sign, don't switch to shifted mode
743 else if ( utf7unb64
[(unsigned)*src
] == 0xff )
745 // empty encoded chunks are not allowed
749 return wxCONV_FAILED
;
751 else // base-64 encoded chunk follows
758 // only printable 7 bit ASCII characters (with the exception of
759 // NUL, TAB, CR and LF) can be used directly
760 if ( cc
>= 0x7f || (cc
< ' ' &&
761 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
762 return wxCONV_FAILED
;
773 // as we didn't read any characters we should be called with the same
774 // data (followed by some more new data) again later so don't save our
778 return wxCONV_FAILED
;
785 // BASE64 encoding table
// Maps a 6 bit value (0..63) to the BASE64 character representing it.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
// UTF-7 encoding table: gives the class of each of the 128 ASCII characters
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
819 static inline bool wxIsUTF7Direct(wchar_t wc
)
821 return wc
< 0x80 && utf7encode
[wc
] < 1;
824 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
825 const wchar_t *src
, size_t srcLen
) const
827 EncoderState stateOrig
,
829 if ( srcLen
== wxNO_LEN
)
831 // we don't apply the stored state when operating on entire strings at
833 statePtr
= &stateOrig
;
835 srcLen
= wxWcslen(src
) + 1;
837 else // do use the mode we left the output in previously
839 stateOrig
= m_stateEncoder
;
840 statePtr
= const_cast<EncoderState
*>(&m_stateEncoder
);
843 EncoderState
& state
= *statePtr
;
848 const wchar_t * const srcEnd
= src
+ srcLen
;
849 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
852 if ( wxIsUTF7Direct(cc
) )
854 if ( state
.IsShifted() )
856 // pad with zeros the last encoded block if necessary
860 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
875 else if ( cc
== '+' && state
.IsDirect() )
886 else if (((wxUint32
)cc
) > 0xffff)
888 // no surrogate pair generation (yet?)
889 return wxCONV_FAILED
;
894 if ( state
.IsDirect() )
903 // BASE64 encode string
906 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
909 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
911 for (state
.bit
+= 8; state
.bit
>= 6; )
915 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
920 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
928 // we need to restore the original encoder state if we were called just to
929 // calculate the amount of space needed as we will presumably be called
930 // again to really convert the data now
937 // ----------------------------------------------------------------------------
939 // ----------------------------------------------------------------------------
// upper bounds of the code values associated with UTF-8 sequences of
// increasing length: elsewhere in this file the encoder increments its count
// while the character exceeds utf8_max[cnt], and the decoder treats a value
// not exceeding utf8_max[ocnt] as an illegal encoding
941 static const wxUint32 utf8_max
[]=
942 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
944 // boundaries of the private use area we use to (temporarily) remap
945 // characters which are invalid in a UTF-8 encoded string
946 const wxUint32 wxUnicodePUA
= 0x100000;
947 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
// this table gives the length of the UTF-8 encoding from its first character
// (0 means that the byte can't start a valid UTF-8 sequence):
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
985 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
986 const char *src
, size_t srcLen
) const
988 wchar_t *out
= dstLen
? dst
: NULL
;
991 if ( srcLen
== wxNO_LEN
)
992 srcLen
= strlen(src
) + 1;
994 for ( const char *p
= src
; ; p
++ )
996 if ( (srcLen
== wxNO_LEN
? !*p
: !srcLen
) )
998 // all done successfully, just add the trailing NULL if we are not
999 // using explicit length
1000 if ( srcLen
== wxNO_LEN
)
1016 if ( out
&& !dstLen
-- )
1020 unsigned char c
= *p
;
1024 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
1027 if ( srcLen
!= wxNO_LEN
)
1034 unsigned len
= tableUtf8Lengths
[c
];
1038 if ( srcLen
< len
) // the test works for wxNO_LEN too
1041 if ( srcLen
!= wxNO_LEN
)
1044 // Char. number range | UTF-8 octet sequence
1045 // (hexadecimal) | (binary)
1046 // ----------------------+----------------------------------------
1047 // 0000 0000 - 0000 007F | 0xxxxxxx
1048 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1049 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1050 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1052 // Code point value is stored in bits marked with 'x',
1053 // lowest-order bit of the value on the right side in the diagram
1054 // above. (from RFC 3629)
1056 // mask to extract lead byte's value ('x' bits above), by sequence
1058 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1060 // mask and value of lead byte's most significant bits, by length:
1061 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1062 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1064 len
--; // it's more convenient to work with 0-based length here
1066 // extract the lead byte's value bits:
1067 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
1070 code
= c
& leadValueMask
[len
];
1072 // all remaining bytes, if any, are handled in the same way
1073 // regardless of sequence's length:
1074 for ( ; len
; --len
)
1077 if ( (c
& 0xC0) != 0x80 )
1078 return wxCONV_FAILED
;
1086 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1087 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
1096 #endif // WC_UTF16/!WC_UTF16
1104 return wxCONV_FAILED
;
1108 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
1109 const wchar_t *src
, size_t srcLen
) const
1111 char *out
= dstLen
? dst
: NULL
;
1114 for ( const wchar_t *wp
= src
; ; wp
++ )
1116 if ( (srcLen
== wxNO_LEN
? !*wp
: !srcLen
) )
1118 // all done successfully, just add the trailing NULL if we are not
1119 // using explicit length
1120 if ( srcLen
== wxNO_LEN
)
1136 if ( srcLen
!= wxNO_LEN
)
1141 // cast is ok for WC_UTF16
1142 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1144 // skip the next char too as we decoded a surrogate
1146 if ( srcLen
!= wxNO_LEN
)
1149 #else // wchar_t is UTF-32
1150 code
= *wp
& 0x7fffffff;
1162 out
[0] = (char)code
;
1165 else if ( code
<= 0x07FF )
1173 // NB: this line takes 6 least significant bits, encodes them as
1174 // 10xxxxxx and discards them so that the next byte can be encoded:
1175 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1176 out
[0] = 0xC0 | code
;
1179 else if ( code
< 0xFFFF )
1187 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1188 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1189 out
[0] = 0xE0 | code
;
1192 else if ( code
<= 0x10FFFF )
1200 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1201 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1202 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1203 out
[0] = 0xF0 | code
;
1208 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1221 // we only get here if an error occurs during decoding
1222 return wxCONV_FAILED
;
1225 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1226 const char *psz
, size_t srcLen
) const
1228 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1229 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1233 // The length can be either given explicitly or computed implicitly for the
1234 // NUL-terminated strings.
1235 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
1236 while ((isNulTerminated
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1238 const char *opsz
= psz
;
1239 bool invalid
= false;
1240 unsigned char cc
= *psz
++, fc
= cc
;
1242 for (cnt
= 0; fc
& 0x80; cnt
++)
1252 // escape the escape character for octal escapes
1253 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1254 && cc
== '\\' && (!buf
|| len
< n
))
1266 // invalid UTF-8 sequence
1271 unsigned ocnt
= cnt
- 1;
1272 wxUint32 res
= cc
& (0x3f >> cnt
);
1276 if ((cc
& 0xC0) != 0x80)
1278 // invalid UTF-8 sequence
1284 res
= (res
<< 6) | (cc
& 0x3f);
1287 if (invalid
|| res
<= utf8_max
[ocnt
])
1289 // illegal UTF-8 encoding
1292 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1293 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1295 // if one of our PUA characters turns up externally
1296 // it must also be treated as an illegal sequence
1297 // (a bit like you have to escape an escape character)
1303 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1304 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1305 if (pa
== wxCONV_FAILED
)
1317 *buf
++ = (wchar_t)res
;
1319 #endif // WC_UTF16/!WC_UTF16
1325 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1327 while (opsz
< psz
&& (!buf
|| len
< n
))
1330 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1331 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1332 wxASSERT(pa
!= wxCONV_FAILED
);
1339 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1345 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1347 while (opsz
< psz
&& (!buf
|| len
< n
))
1349 if ( buf
&& len
+ 3 < n
)
1351 unsigned char on
= *opsz
;
1353 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1354 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1355 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1362 else // MAP_INVALID_UTF8_NOT
1364 return wxCONV_FAILED
;
1370 if ( isNulTerminated
)
1372 // Add the trailing NUL in this case if we have a large enough buffer.
1373 if ( buf
&& (len
< n
) )
1376 // And count it in any case.
// Returns true if wch is one of the octal digits L'0'..L'7'.
static inline bool isoctal(wchar_t wch)
{
    return L'0' <= wch && wch <= L'7';
}
1388 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1389 const wchar_t *psz
, size_t srcLen
) const
1391 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1392 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1396 // The length can be either given explicitly or computed implicitly for the
1397 // NUL-terminated strings.
1398 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
1399 while ((isNulTerminated
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1404 // cast is ok for WC_UTF16
1405 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1406 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1408 cc
= (*psz
++) & 0x7fffffff;
1411 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1412 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1415 *buf
++ = (char)(cc
- wxUnicodePUA
);
1418 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1419 && cc
== L
'\\' && psz
[0] == L
'\\' )
1426 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1428 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1432 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1433 (psz
[1] - L
'0') * 010 +
1443 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1459 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1461 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1467 if ( isNulTerminated
)
1469 // Add the trailing NUL in this case if we have a large enough buffer.
1470 if ( buf
&& (len
< n
) )
1473 // And count it in any case.
1480 // ============================================================================
1482 // ============================================================================
1484 #ifdef WORDS_BIGENDIAN
1485 #define wxMBConvUTF16straight wxMBConvUTF16BE
1486 #define wxMBConvUTF16swap wxMBConvUTF16LE
1488 #define wxMBConvUTF16swap wxMBConvUTF16BE
1489 #define wxMBConvUTF16straight wxMBConvUTF16LE
1493 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1495 if ( srcLen
== wxNO_LEN
)
1497 // count the number of bytes in input, including the trailing NULs
1498 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1499 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1502 srcLen
*= BYTES_PER_CHAR
;
1504 else // we already have the length
1506 // we can only convert an entire number of UTF-16 characters
1507 if ( srcLen
% BYTES_PER_CHAR
)
1508 return wxCONV_FAILED
;
1514 // case when in-memory representation is UTF-16 too
1517 // ----------------------------------------------------------------------------
1518 // conversions without endianness change
1519 // ----------------------------------------------------------------------------
1522 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1523 const char *src
, size_t srcLen
) const
1525 // set up the scene for using memcpy() (which is presumably more efficient
1526 // than copying the bytes one by one)
1527 srcLen
= GetLength(src
, srcLen
);
1528 if ( srcLen
== wxNO_LEN
)
1529 return wxCONV_FAILED
;
1531 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1534 if ( dstLen
< inLen
)
1535 return wxCONV_FAILED
;
1537 memcpy(dst
, src
, srcLen
);
1544 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1545 const wchar_t *src
, size_t srcLen
) const
1547 if ( srcLen
== wxNO_LEN
)
1548 srcLen
= wxWcslen(src
) + 1;
1550 srcLen
*= BYTES_PER_CHAR
;
1554 if ( dstLen
< srcLen
)
1555 return wxCONV_FAILED
;
1557 memcpy(dst
, src
, srcLen
);
1563 // ----------------------------------------------------------------------------
1564 // endian-reversing conversions
1565 // ----------------------------------------------------------------------------
1568 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1569 const char *src
, size_t srcLen
) const
1571 srcLen
= GetLength(src
, srcLen
);
1572 if ( srcLen
== wxNO_LEN
)
1573 return wxCONV_FAILED
;
1575 srcLen
/= BYTES_PER_CHAR
;
1579 if ( dstLen
< srcLen
)
1580 return wxCONV_FAILED
;
1582 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1583 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1585 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1593 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1594 const wchar_t *src
, size_t srcLen
) const
1596 if ( srcLen
== wxNO_LEN
)
1597 srcLen
= wxWcslen(src
) + 1;
1599 srcLen
*= BYTES_PER_CHAR
;
1603 if ( dstLen
< srcLen
)
1604 return wxCONV_FAILED
;
1606 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1607 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1609 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1616 #else // !WC_UTF16: wchar_t is UTF-32
1618 // ----------------------------------------------------------------------------
1619 // conversions without endianness change
1620 // ----------------------------------------------------------------------------
1623 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1624 const char *src
, size_t srcLen
) const
1626 srcLen
= GetLength(src
, srcLen
);
1627 if ( srcLen
== wxNO_LEN
)
1628 return wxCONV_FAILED
;
1630 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1633 // optimization: return maximal space which could be needed for this
1634 // string even if the real size could be smaller if the buffer contains
1640 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1641 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1643 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1645 return wxCONV_FAILED
;
1647 if ( ++outLen
> dstLen
)
1648 return wxCONV_FAILED
;
1658 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1659 const wchar_t *src
, size_t srcLen
) const
1661 if ( srcLen
== wxNO_LEN
)
1662 srcLen
= wxWcslen(src
) + 1;
1665 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1666 for ( size_t n
= 0; n
< srcLen
; n
++ )
1668 wxUint16 cc
[2] = { 0 };
1669 const size_t numChars
= encode_utf16(*src
++, cc
);
1670 if ( numChars
== wxCONV_FAILED
)
1671 return wxCONV_FAILED
;
1673 outLen
+= numChars
* BYTES_PER_CHAR
;
1676 if ( outLen
> dstLen
)
1677 return wxCONV_FAILED
;
1680 if ( numChars
== 2 )
1682 // second character of a surrogate
1691 // ----------------------------------------------------------------------------
1692 // endian-reversing conversions
1693 // ----------------------------------------------------------------------------
1696 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1697 const char *src
, size_t srcLen
) const
1699 srcLen
= GetLength(src
, srcLen
);
1700 if ( srcLen
== wxNO_LEN
)
1701 return wxCONV_FAILED
;
1703 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1706 // optimization: return maximal space which could be needed for this
1707 // string even if the real size could be smaller if the buffer contains
1713 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1714 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1719 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1721 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1723 const size_t numChars
= decode_utf16(tmp
, ch
);
1724 if ( numChars
== wxCONV_FAILED
)
1725 return wxCONV_FAILED
;
1727 if ( numChars
== 2 )
1730 if ( ++outLen
> dstLen
)
1731 return wxCONV_FAILED
;
1741 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1742 const wchar_t *src
, size_t srcLen
) const
1744 if ( srcLen
== wxNO_LEN
)
1745 srcLen
= wxWcslen(src
) + 1;
1748 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1749 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1751 wxUint16 cc
[2] = { 0 };
1752 const size_t numChars
= encode_utf16(*src
, cc
);
1753 if ( numChars
== wxCONV_FAILED
)
1754 return wxCONV_FAILED
;
1756 outLen
+= numChars
* BYTES_PER_CHAR
;
1759 if ( outLen
> dstLen
)
1760 return wxCONV_FAILED
;
1762 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1763 if ( numChars
== 2 )
1765 // second character of a surrogate
1766 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1774 #endif // WC_UTF16/!WC_UTF16
1777 // ============================================================================
1779 // ============================================================================
1781 #ifdef WORDS_BIGENDIAN
1782 #define wxMBConvUTF32straight wxMBConvUTF32BE
1783 #define wxMBConvUTF32swap wxMBConvUTF32LE
1785 #define wxMBConvUTF32swap wxMBConvUTF32BE
1786 #define wxMBConvUTF32straight wxMBConvUTF32LE
1790 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1791 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1794 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1796 if ( srcLen
== wxNO_LEN
)
1798 // count the number of bytes in input, including the trailing NULs
1799 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1800 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1803 srcLen
*= BYTES_PER_CHAR
;
1805 else // we already have the length
1807 // we can only convert an entire number of UTF-32 characters
1808 if ( srcLen
% BYTES_PER_CHAR
)
1809 return wxCONV_FAILED
;
1815 // case when in-memory representation is UTF-16
1818 // ----------------------------------------------------------------------------
1819 // conversions without endianness change
1820 // ----------------------------------------------------------------------------
1823 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1824 const char *src
, size_t srcLen
) const
1826 srcLen
= GetLength(src
, srcLen
);
1827 if ( srcLen
== wxNO_LEN
)
1828 return wxCONV_FAILED
;
1830 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1831 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1833 for ( size_t n
= 0; n
< inLen
; n
++ )
1835 wxUint16 cc
[2] = { 0 };
1836 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1837 if ( numChars
== wxCONV_FAILED
)
1838 return wxCONV_FAILED
;
1843 if ( outLen
> dstLen
)
1844 return wxCONV_FAILED
;
1847 if ( numChars
== 2 )
1849 // second character of a surrogate
1859 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1860 const wchar_t *src
, size_t srcLen
) const
1862 if ( srcLen
== wxNO_LEN
)
1863 srcLen
= wxWcslen(src
) + 1;
1867 // optimization: return maximal space which could be needed for this
1868 // string instead of the exact amount which could be less if there are
1869 // any surrogates in the input
1871 // we consider that surrogates are rare enough to make it worthwhile to
1872 // avoid running the loop below at the cost of slightly extra memory
1874 return srcLen
* BYTES_PER_CHAR
;
1877 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
1879 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1881 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1883 return wxCONV_FAILED
;
1885 outLen
+= BYTES_PER_CHAR
;
1887 if ( outLen
> dstLen
)
1888 return wxCONV_FAILED
;
1896 // ----------------------------------------------------------------------------
1897 // endian-reversing conversions
1898 // ----------------------------------------------------------------------------
1901 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1902 const char *src
, size_t srcLen
) const
1904 srcLen
= GetLength(src
, srcLen
);
1905 if ( srcLen
== wxNO_LEN
)
1906 return wxCONV_FAILED
;
1908 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1909 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1911 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1913 wxUint16 cc
[2] = { 0 };
1914 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1915 if ( numChars
== wxCONV_FAILED
)
1916 return wxCONV_FAILED
;
1921 if ( outLen
> dstLen
)
1922 return wxCONV_FAILED
;
1925 if ( numChars
== 2 )
1927 // second character of a surrogate
1937 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1938 const wchar_t *src
, size_t srcLen
) const
1940 if ( srcLen
== wxNO_LEN
)
1941 srcLen
= wxWcslen(src
) + 1;
1945 // optimization: return maximal space which could be needed for this
1946 // string instead of the exact amount which could be less if there are
1947 // any surrogates in the input
1949 // we consider that surrogates are rare enough to make it worthwhile to
1950 // avoid running the loop below at the cost of slightly extra memory
1952 return srcLen
*BYTES_PER_CHAR
;
1955 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
1957 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1959 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1961 return wxCONV_FAILED
;
1963 outLen
+= BYTES_PER_CHAR
;
1965 if ( outLen
> dstLen
)
1966 return wxCONV_FAILED
;
1968 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1974 #else // !WC_UTF16: wchar_t is UTF-32
1976 // ----------------------------------------------------------------------------
1977 // conversions without endianness change
1978 // ----------------------------------------------------------------------------
1981 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1982 const char *src
, size_t srcLen
) const
1984 // use memcpy() as it should be much faster than hand-written loop
1985 srcLen
= GetLength(src
, srcLen
);
1986 if ( srcLen
== wxNO_LEN
)
1987 return wxCONV_FAILED
;
1989 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1992 if ( dstLen
< inLen
)
1993 return wxCONV_FAILED
;
1995 memcpy(dst
, src
, srcLen
);
2002 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
2003 const wchar_t *src
, size_t srcLen
) const
2005 if ( srcLen
== wxNO_LEN
)
2006 srcLen
= wxWcslen(src
) + 1;
2008 srcLen
*= BYTES_PER_CHAR
;
2012 if ( dstLen
< srcLen
)
2013 return wxCONV_FAILED
;
2015 memcpy(dst
, src
, srcLen
);
2021 // ----------------------------------------------------------------------------
2022 // endian-reversing conversions
2023 // ----------------------------------------------------------------------------
2026 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
2027 const char *src
, size_t srcLen
) const
2029 srcLen
= GetLength(src
, srcLen
);
2030 if ( srcLen
== wxNO_LEN
)
2031 return wxCONV_FAILED
;
2033 srcLen
/= BYTES_PER_CHAR
;
2037 if ( dstLen
< srcLen
)
2038 return wxCONV_FAILED
;
2040 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
2041 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
2043 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
2051 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
2052 const wchar_t *src
, size_t srcLen
) const
2054 if ( srcLen
== wxNO_LEN
)
2055 srcLen
= wxWcslen(src
) + 1;
2057 srcLen
*= BYTES_PER_CHAR
;
2061 if ( dstLen
< srcLen
)
2062 return wxCONV_FAILED
;
2064 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
2065 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
2067 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
2074 #endif // WC_UTF16/!WC_UTF16
2077 // ============================================================================
2078 // The classes doing conversion using the iconv_xxx() functions
2079 // ============================================================================
2083 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2084 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2085 // (unless there's yet another bug in glibc) the only case when iconv()
2086 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2087 // left in the input buffer -- when _real_ error occurs,
2088 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2090 // [This bug does not appear in glibc 2.2.]
2091 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2092 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2093 (errno != E2BIG || bufLeft != 0))
2095 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2098 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2100 #define ICONV_T_INVALID ((iconv_t)-1)
2102 #if SIZEOF_WCHAR_T == 4
2103 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2104 #define WC_ENC wxFONTENCODING_UTF32
2105 #elif SIZEOF_WCHAR_T == 2
2106 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2107 #define WC_ENC wxFONTENCODING_UTF16
2108 #else // sizeof(wchar_t) != 2 nor 4
2109 // does this ever happen?
2110 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2113 // ----------------------------------------------------------------------------
2114 // wxMBConv_iconv: encapsulates an iconv character set
2115 // ----------------------------------------------------------------------------
2117 class wxMBConv_iconv
: public wxMBConv
2120 wxMBConv_iconv(const char *name
);
2121 virtual ~wxMBConv_iconv();
2123 // implement base class virtual methods
2124 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
2125 const char *src
, size_t srcLen
= wxNO_LEN
) const;
2126 virtual size_t FromWChar(char *dst
, size_t dstLen
,
2127 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
2128 virtual size_t GetMBNulLen() const;
2130 #if wxUSE_UNICODE_UTF8
2131 virtual bool IsUTF8() const;
2134 virtual wxMBConv
*Clone() const
2136 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
);
2137 p
->m_minMBCharWidth
= m_minMBCharWidth
;
2142 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
2145 // the iconv handlers used to translate from multibyte
2146 // to wide char and in the other direction
2151 // guards access to m2w and w2m objects
2152 wxMutex m_iconvMutex
;
2156 // the name (for iconv_open()) of a wide char charset -- if none is
2157 // available on this machine, it will remain empty
2158 static wxString ms_wcCharsetName
;
2160 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2161 // different endian-ness than the native one
2162 static bool ms_wcNeedsSwap
;
2165 // name of the encoding handled by this conversion
2168 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2170 size_t m_minMBCharWidth
;
2173 // make the constructor available for unit testing
2174 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2176 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2177 if ( !result
->IsOk() )
2186 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2187 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
2189 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
2190 : m_name(wxStrdup(name
))
2192 m_minMBCharWidth
= 0;
2194 // check for charset that represents wchar_t:
2195 if ( ms_wcCharsetName
.empty() )
2197 wxLogTrace(TRACE_STRCONV
, wxT("Looking for wide char codeset:"));
2200 const wxChar
*const *names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2201 #else // !wxUSE_FONTMAP
2202 static const wxChar
*const names_static
[] =
2204 #if SIZEOF_WCHAR_T == 4
2206 #elif SIZEOF_WCHAR_T == 2
2211 const wxChar
*const *names
= names_static
;
2212 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2214 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2216 const wxString
nameCS(*names
);
2218 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2219 wxString
nameXE(nameCS
);
2221 #ifdef WORDS_BIGENDIAN
2222 nameXE
+= wxT("BE");
2223 #else // little endian
2224 nameXE
+= wxT("LE");
2227 wxLogTrace(TRACE_STRCONV
, wxT(" trying charset \"%s\""),
2230 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2231 if ( m2w
== ICONV_T_INVALID
)
2233 // try charset w/o bytesex info (e.g. "UCS4")
2234 wxLogTrace(TRACE_STRCONV
, wxT(" trying charset \"%s\""),
2236 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2238 // and check for bytesex ourselves:
2239 if ( m2w
!= ICONV_T_INVALID
)
2241 char buf
[2], *bufPtr
;
2250 outsz
= SIZEOF_WCHAR_T
* 2;
2251 char* wbufPtr
= (char*)wbuf
;
2255 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2258 if (ICONV_FAILED(res
, insz
))
2260 wxLogLastError(wxT("iconv"));
2261 wxLogError(_("Conversion to charset '%s' doesn't work."),
2264 else // ok, can convert to this encoding, remember it
2266 ms_wcCharsetName
= nameCS
;
2267 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2271 else // use charset not requiring byte swapping
2273 ms_wcCharsetName
= nameXE
;
2277 wxLogTrace(TRACE_STRCONV
,
2278 wxT("iconv wchar_t charset is \"%s\"%s"),
2279 ms_wcCharsetName
.empty() ? wxString("<none>")
2281 ms_wcNeedsSwap
? wxT(" (needs swap)")
2284 else // we already have ms_wcCharsetName
2286 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2289 if ( ms_wcCharsetName
.empty() )
2291 w2m
= ICONV_T_INVALID
;
2295 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2296 if ( w2m
== ICONV_T_INVALID
)
2298 wxLogTrace(TRACE_STRCONV
,
2299 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2300 ms_wcCharsetName
.c_str(), name
);
2305 wxMBConv_iconv::~wxMBConv_iconv()
2307 free(const_cast<char *>(m_name
));
2309 if ( m2w
!= ICONV_T_INVALID
)
2311 if ( w2m
!= ICONV_T_INVALID
)
2316 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2317 const char *src
, size_t srcLen
) const
2319 if ( srcLen
== wxNO_LEN
)
2321 // find the string length: note that this must be done differently for
2322 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2324 const size_t nulLen
= GetMBNulLen();
2328 return wxCONV_FAILED
;
2331 srcLen
= strlen(src
); // arguably more optimized than our version
2336 // for UTF-16/32 we not only need to have 2/4 consecutive NULs
2337 // but they also have to start at a character boundary and not
2338 // span two adjacent characters
2340 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2346 // when we're determining the length of the string ourselves we count
2347 // the terminating NUL(s) as part of it and always NUL-terminate the
2352 // we express length in the number of (wide) characters but iconv always
2353 // counts buffer sizes in bytes
2354 dstLen
*= SIZEOF_WCHAR_T
;
2357 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2358 // Unfortunately there are a couple of global wxCSConv objects such as
2359 // wxConvLocal that are used all over wx code, so we have to make sure
2360 // the handle is used by at most one thread at a time. Otherwise
2361 // only a few wx classes would be safe to use from non-main threads
2362 // as MB<->WC conversion would fail "randomly".
2363 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2364 #endif // wxUSE_THREADS
2367 const char *pszPtr
= src
;
2371 char* bufPtr
= (char*)dst
;
2373 // have destination buffer, convert there
2374 size_t dstLenOrig
= dstLen
;
2376 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2379 // convert the number of bytes converted as returned by iconv to the
2380 // number of (wide) characters converted that we need
2381 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2385 // convert to native endianness
2386 for ( unsigned i
= 0; i
< res
; i
++ )
2387 dst
[i
] = WC_BSWAP(dst
[i
]);
2390 else // no destination buffer
2392 // convert using temp buffer to calculate the size of the buffer needed
2398 char* bufPtr
= (char*)tbuf
;
2399 dstLen
= 8 * SIZEOF_WCHAR_T
;
2402 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2405 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
2407 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2410 if (ICONV_FAILED(cres
, srcLen
))
2412 //VS: it is ok if iconv fails, hence trace only
2413 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2414 return wxCONV_FAILED
;
2420 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2421 const wchar_t *src
, size_t srcLen
) const
2424 // NB: the need for this lock is explained in ToWChar()
2425 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2428 if ( srcLen
== wxNO_LEN
)
2429 srcLen
= wxWcslen(src
) + 1;
2431 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2432 size_t outbuflen
= dstLen
;
2435 wchar_t *tmpbuf
= 0;
2439 // need to copy to temp buffer to switch endianness
2440 // (doing WC_BSWAP twice on the original buffer won't work, as it
2441 // could be in read-only memory, or be accessed in some other thread)
2442 tmpbuf
= (wchar_t *)malloc(inbuflen
);
2443 for ( size_t i
= 0; i
< srcLen
; i
++ )
2444 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2449 char* inbuf
= (char*)src
;
2452 // have destination buffer, convert there
2453 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2455 res
= dstLen
- outbuflen
;
2457 else // no destination buffer
2459 // convert using temp buffer to calculate the size of the buffer needed
2465 outbuflen
= WXSIZEOF(tbuf
);
2467 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2469 res
+= WXSIZEOF(tbuf
) - outbuflen
;
2471 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2479 if (ICONV_FAILED(cres
, inbuflen
))
2481 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2482 return wxCONV_FAILED
;
2488 size_t wxMBConv_iconv::GetMBNulLen() const
2490 if ( m_minMBCharWidth
== 0 )
2492 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2495 // NB: the need for this lock is explained in ToWChar()
2496 wxMutexLocker
lock(self
->m_iconvMutex
);
2499 const wchar_t *wnul
= L
"";
2500 char buf
[8]; // should be enough for NUL in any encoding
2501 size_t inLen
= sizeof(wchar_t),
2502 outLen
= WXSIZEOF(buf
);
2503 char *inBuff
= (char *)wnul
;
2504 char *outBuff
= buf
;
2505 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2507 self
->m_minMBCharWidth
= (size_t)-1;
2511 self
->m_minMBCharWidth
= outBuff
- buf
;
2515 return m_minMBCharWidth
;
2518 #if wxUSE_UNICODE_UTF8
2519 bool wxMBConv_iconv::IsUTF8() const
2521 return wxStricmp(m_name
, "UTF-8") == 0 ||
2522 wxStricmp(m_name
, "UTF8") == 0;
2526 #endif // HAVE_ICONV
2529 // ============================================================================
2530 // Win32 conversion classes
2531 // ============================================================================
2533 #ifdef wxHAVE_WIN32_MB2WC
2537 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2538 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2541 class wxMBConv_win32
: public wxMBConv
2546 m_CodePage
= CP_ACP
;
2547 m_minMBCharWidth
= 0;
2550 wxMBConv_win32(const wxMBConv_win32
& conv
)
2553 m_CodePage
= conv
.m_CodePage
;
2554 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2558 wxMBConv_win32(const char* name
)
2560 m_CodePage
= wxCharsetToCodepage(name
);
2561 m_minMBCharWidth
= 0;
2564 wxMBConv_win32(wxFontEncoding encoding
)
2566 m_CodePage
= wxEncodingToCodepage(encoding
);
2567 m_minMBCharWidth
= 0;
2569 #endif // wxUSE_FONTMAP
2571 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2573 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2574 // the behaviour is not compatible with the Unix version (using iconv)
2575 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2576 // wouldn't work if reading an incomplete MB char didn't result in an
2579 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2580 // Win XP or newer and it is not supported for UTF-[78] so we always
2581 // use our own conversions in this case. See
2582 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2583 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2584 if ( m_CodePage
== CP_UTF8
)
2586 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2589 if ( m_CodePage
== CP_UTF7
)
2591 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2595 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2596 IsAtLeastWin2kSP4() )
2598 flags
= MB_ERR_INVALID_CHARS
;
2601 const size_t len
= ::MultiByteToWideChar
2603 m_CodePage
, // code page
2604 flags
, // flags: fail on error
2605 psz
, // input string
2606 -1, // its length (NUL-terminated)
2607 buf
, // output string
2608 buf
? n
: 0 // size of output buffer
2612 // function totally failed
2613 return wxCONV_FAILED
;
2616 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2617 // check if we succeeded, by doing a double trip:
2618 if ( !flags
&& buf
)
2620 const size_t mbLen
= strlen(psz
);
2621 wxCharBuffer
mbBuf(mbLen
);
2622 if ( ::WideCharToMultiByte
2629 mbLen
+ 1, // size in bytes, not length
2633 strcmp(mbBuf
, psz
) != 0 )
2635 // we didn't obtain the same thing we started from, hence
2636 // the conversion was lossy and we consider that it failed
2637 return wxCONV_FAILED
;
2641 // note that it returns count of written chars for buf != NULL and size
2642 // of the needed buffer for buf == NULL so in either case the length of
2643 // the string (which never includes the terminating NUL) is one less
2647 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2650 we have a problem here: by default, WideCharToMultiByte() may
2651 replace characters unrepresentable in the target code page with bad
2652 quality approximations such as turning "1/2" symbol (U+00BD) into
2653 "1" for the code pages which don't have it and we, obviously, want
2654 to avoid this at any price
2656 the trouble is that this function does it _silently_, i.e. it won't
2657 even tell us whether it did or not... Win98/2000 and higher provide
2658 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2659 we have to resort to a round trip, i.e. check that converting back
2660 results in the same string -- this is, of course, expensive but
2661 otherwise we simply can't be sure to not garble the data.
2664 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2665 // it doesn't work with CJK encodings (which we test for rather roughly
2666 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2668 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2671 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2673 // it's our lucky day
2674 flags
= WC_NO_BEST_FIT_CHARS
;
2675 pUsedDef
= &usedDef
;
2677 else // old system or unsupported encoding
2683 const size_t len
= ::WideCharToMultiByte
2685 m_CodePage
, // code page
2686 flags
, // either none or no best fit
2687 pwz
, // input string
2688 -1, // it is (wide) NUL-terminated
2689 buf
, // output buffer
2690 buf
? n
: 0, // and its size
2691 NULL
, // default "replacement" char
2692 pUsedDef
// [out] was it used?
2697 // function totally failed
2698 return wxCONV_FAILED
;
2701 // we did something, check if we really succeeded
2704 // check if the conversion failed, i.e. if any replacements
2707 return wxCONV_FAILED
;
2709 else // we must resort to double tripping...
2711 // first we need to ensure that we really have the MB data: this is
2712 // not the case if we're called with NULL buffer, in which case we
2713 // need to do the conversion yet again
2714 wxCharBuffer bufDef
;
2717 bufDef
= wxCharBuffer(len
);
2718 buf
= bufDef
.data();
2719 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2720 buf
, len
, NULL
, NULL
) )
2721 return wxCONV_FAILED
;
2726 wxWCharBuffer
wcBuf(n
);
2727 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2728 wcscmp(wcBuf
, pwz
) != 0 )
2730 // we didn't obtain the same thing we started from, hence
2731 // the conversion was lossy and we consider that it failed
2732 return wxCONV_FAILED
;
2736 // see the comment above for the reason of "len - 1"
// Return the size in bytes of the NUL terminator in this code page.
//
// The answer is cached in m_minMBCharWidth: while that member is still 0 the
// length is obtained by asking WideCharToMultiByte() to convert an empty wide
// string (i.e. only its NUL); afterwards the cached value is returned as is.
// Two of the assignments below cache (size_t)-1 instead of the length, one of
// them right after logging "Unexpected NUL length".
2740 virtual size_t GetMBNulLen() const
2742 if ( m_minMBCharWidth
== 0 )
2744 int len
= ::WideCharToMultiByte
2746 m_CodePage
, // code page
2748 L
"", // input string
2749 1, // translate just the NUL
2750 NULL
, // output buffer
2752 NULL
, // no replacement char
2753 NULL
// [out] don't care if it was used
// this method is const, so the cache is updated through a non-const alias
// of this object
2756 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2760 wxLogDebug(wxT("Unexpected NUL length %d"), len
);
2761 self
->m_minMBCharWidth
= (size_t)-1;
2765 self
->m_minMBCharWidth
= (size_t)-1;
2771 self
->m_minMBCharWidth
= len
;
2776 return m_minMBCharWidth
;
// Return a newly allocated copy of this converter made with the copy
// constructor.
2779 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
// The converter is considered usable unless m_CodePage is -1
// (NOTE(review): -1 presumably marks a code page that could not be
// determined -- confirm against the constructors).
2781 bool IsOk() const { return m_CodePage
!= -1; }
// Check whether the running system is recent enough for the
// WC_NO_BEST_FIT_CHARS flag to be used.
//
// The OS family and version come from wxGetOsVersion(): the 9x family needs
// verMaj >= 4 and verMin >= 10 and the NT family needs verMaj >= 5 (Windows 98
// and Windows 2000 respectively, going by the name of the variable). The
// answer is kept in the function-local static s_isWin98Or2k, where -1 means
// "not determined yet".
2784 static bool CanUseNoBestFit()
2786 static int s_isWin98Or2k
= -1;
2788 if ( s_isWin98Or2k
== -1 )
2791 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2793 case wxOS_WINDOWS_9X
:
2794 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2797 case wxOS_WINDOWS_NT
:
2798 s_isWin98Or2k
= verMaj
>= 5;
2802 // unknown: be conservative by default
// whatever branch was taken, the cached value must not be -1 any more
2807 wxASSERT_MSG( s_isWin98Or2k
!= -1, wxT("should be set above") );
2810 return s_isWin98Or2k
== 1;
// Check whether the system is Windows 2000 SP4 or any later version.
//
// The result is computed from the version reported by GetVersionEx() and
// cached in the function-local static s_isAtLeastWin2kSP4, where -1 means
// "not computed yet".
2813 static bool IsAtLeastWin2kSP4()
2818 static int s_isAtLeastWin2kSP4
= -1;
2820 if ( s_isAtLeastWin2kSP4
== -1 )
2822 OSVERSIONINFOEX ver
;
// zero the structure first: the return value of GetVersionEx() is not
// checked, so after a failed call the tests below only see zeros
2824 memset(&ver
, 0, sizeof(ver
));
2825 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2826 GetVersionEx((OSVERSIONINFO
*)&ver
);
2828 s_isAtLeastWin2kSP4
=
2829 ((ver
.dwMajorVersion
> 5) || // Vista+
2830 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2831 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2832 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2836 return s_isAtLeastWin2kSP4
== 1;
2841 // the code page we're working with
2844 // cached result of GetMBNulLen(), set to 0 initially meaning
// that it has not been computed yet: GetMBNulLen() only calls
// WideCharToMultiByte() while this member is still 0
2846 size_t m_minMBCharWidth
;
2849 #endif // wxHAVE_WIN32_MB2WC
2852 // ============================================================================
2853 // wxEncodingConverter based conversion classes
2854 // ============================================================================
// wxMBConv implementation built on top of wxEncodingConverter.
//
// Two converters are kept: m2w is initialized to translate from m_enc to
// wxFONTENCODING_UNICODE and w2m to do the opposite. m_ok is true only if
// m_enc lies outside the [wxFONTENCODING_MACMIN, wxFONTENCODING_MACMAX] range
// and both converters could be initialized; IsOk() reports it.
2858 class wxMBConv_wxwin
: public wxMBConv
2863 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2864 // The wxMBConv_cf class does a better job.
2865 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2866 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2867 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2871 // temporarily just use wxEncodingConverter stuff,
2872 // so that it works while a better implementation is built
//
// Construct from a charset name: the name is mapped to a wxFontEncoding by
// the font mapper, passing false as the second argument of
// CharsetToEncoding(). wxFONTENCODING_SYSTEM is the other value assigned to
// m_enc below (NOTE(review): presumably when the font mapper is not compiled
// in -- confirm).
2873 wxMBConv_wxwin(const char* name
)
2876 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2878 m_enc
= wxFONTENCODING_SYSTEM
;
// Construct directly from an encoding value.
2883 wxMBConv_wxwin(wxFontEncoding enc
)
// Convert the NUL-terminated string psz to wide characters using m2w. The
// buffer size n is not used (WXUNUSED); wxCONV_FAILED is returned if the
// converter reports a failure.
2890 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2892 size_t inbuf
= strlen(psz
);
2895 if (!m2w
.Convert(psz
, buf
))
2896 return wxCONV_FAILED
;
// Convert the NUL-terminated wide string psz to m_enc using w2m. As in
// MB2WC(), n is not used and wxCONV_FAILED signals a converter failure.
2901 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2903 const size_t inbuf
= wxWcslen(psz
);
2906 if (!w2m
.Convert(psz
, buf
))
2907 return wxCONV_FAILED
;
// Size of the NUL terminator for this encoding: the UTF-16 and UTF-32
// encodings are singled out by the case labels below.
2913 virtual size_t GetMBNulLen() const
2917 case wxFONTENCODING_UTF16BE
:
2918 case wxFONTENCODING_UTF16LE
:
2921 case wxFONTENCODING_UTF32BE
:
2922 case wxFONTENCODING_UTF32LE
:
// Create a new converter for the same encoding (m_enc).
2930 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
// Tell whether the initialization of both converters succeeded.
2932 bool IsOk() const { return m_ok
; }
// the encoding this object converts to and from
2935 wxFontEncoding m_enc
;
// m2w: m_enc -> Unicode, w2m: Unicode -> m_enc
2936 wxEncodingConverter m2w
, w2m
;
2939 // were we initialized successfully?
2942 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin
);
2945 // make the constructors available for unit testing
//
// Allocate a wxMBConv_wxwin for the given charset name and validate it with
// IsOk().
2946 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2948 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2949 if ( !result
->IsOk() )
2958 #endif // wxUSE_FONTMAP
2960 // ============================================================================
2961 // wxCSConv implementation
2962 // ============================================================================
2964 void wxCSConv::Init()
// Store the encoding to use in m_encoding, normalizing the special values:
//  - wxFONTENCODING_MAX and wxFONTENCODING_SYSTEM are kept as
//    wxFONTENCODING_SYSTEM when a name is available; without a name the
//    system encoding returned by wxLocale::GetSystemEncoding() is used
//    (wxUSE_INTL builds) and wxFONTENCODING_ISO8859_1 is the last resort;
//  - wxFONTENCODING_DEFAULT becomes wxFONTENCODING_ISO8859_1;
//  - any other value is stored unchanged.
2970 void wxCSConv::SetEncoding(wxFontEncoding encoding
)
2974 case wxFONTENCODING_MAX
:
2975 case wxFONTENCODING_SYSTEM
:
2978 // It's ok to not have encoding value if we have a name for it.
2979 m_encoding
= wxFONTENCODING_SYSTEM
;
2981 else // No name either.
2983 // Fall back to the system default encoding in this case (not
2984 // sure how much sense this makes but this is how the old
2985 // code used to behave).
2987 m_encoding
= wxLocale::GetSystemEncoding();
2988 if ( m_encoding
== wxFONTENCODING_SYSTEM
)
2989 #endif // wxUSE_INTL
2990 m_encoding
= wxFONTENCODING_ISO8859_1
;
2994 case wxFONTENCODING_DEFAULT
:
2995 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2996 m_encoding
= wxFONTENCODING_ISO8859_1
;
3000 // Just use the provided encoding.
3001 m_encoding
= encoding
;
// Construct the converter from a charset name.
//
// A non-empty name is remembered with SetName() (as an ASCII string) before
// the encoding is set: either to the value wxFontMapperBase associates with
// the name or to wxFONTENCODING_SYSTEM. The object doing the real conversion
// is created last, by DoCreate().
3005 wxCSConv::wxCSConv(const wxString
& charset
)
3009 if ( !charset
.empty() )
3011 SetName(charset
.ToAscii());
3015 SetEncoding(wxFontMapperBase::GetEncodingFromName(charset
));
3017 SetEncoding(wxFONTENCODING_SYSTEM
);
3020 m_convReal
= DoCreate();
// Construct the converter from an encoding value.
//
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not valid arguments: they
// trigger wxFAIL_MSG() and are replaced by wxFONTENCODING_SYSTEM before the
// encoding is stored with SetEncoding() and the real converter is created by
// DoCreate().
3023 wxCSConv::wxCSConv(wxFontEncoding encoding
)
3025 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
3027 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
3029 encoding
= wxFONTENCODING_SYSTEM
;
3034 SetEncoding(encoding
);
3036 m_convReal
= DoCreate();
3039 wxCSConv::~wxCSConv()
// Copy constructor: take over the name and the encoding of conv and create a
// new real converter with DoCreate() instead of sharing conv.m_convReal.
3044 wxCSConv::wxCSConv(const wxCSConv
& conv
)
3049 SetName(conv
.m_name
);
3050 SetEncoding(conv
.m_encoding
);
3052 m_convReal
= DoCreate();
// Assignment: take over the name and the encoding of conv and create a new
// real converter with DoCreate().
//
// NOTE(review): conv.m_name is read below; if the current name of this object
// is released before these calls, assigning an object to itself would read
// the released name -- confirm that self-assignment is handled.
3055 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
3059 SetName(conv
.m_name
);
3060 SetEncoding(conv
.m_encoding
);
3062 m_convReal
= DoCreate();
// Release what this object owns; the real converter is destroyed with
// wxDELETE(), which also resets m_convReal to NULL.
3067 void wxCSConv::Clear()
3072 wxDELETE(m_convReal
);
// Remember the charset name: m_name receives a copy of charset made with
// wxStrdup(), not the pointer itself.
3075 void wxCSConv::SetName(const char *charset
)
3078 m_name
= wxStrdup(charset
);
// Hash map from wxFontEncoding to a charset name. wxCSConv::DoCreate() uses
// gs_nameCache to remember, for each encoding, the name for which a
// wxMBConv_iconv object was created; an empty string is stored to cache a
// failure.
3083 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
3084 wxEncodingNameCache
);
3086 static wxEncodingNameCache gs_nameCache
;
// Create the object doing the real conversion for m_name / m_encoding.
//
// wxFONTENCODING_ISO8859_1 is special-cased first as wxCSConv handles it
// itself. For the other encodings the candidates appear in this order:
// wxMBConv_iconv (HAVE_ICONV), wxMBConv_win32 (wxHAVE_WIN32_MB2WC),
// wxMBConv_cf (__DARWIN__), the built-in UTF-7/8/16/32 classes and finally
// wxMBConv_wxwin. If none of them can be used, this is reported with
// wxLogTrace().
3089 wxMBConv
*wxCSConv::DoCreate() const
3092 wxLogTrace(TRACE_STRCONV
,
3093 wxT("creating conversion for %s"),
3095 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
3096 #endif // wxUSE_FONTMAP
3098 // check for the special case of ASCII or ISO8859-1 charset: as we have
3099 // special knowledge of it anyhow, we don't need to create a special
3100 // conversion object
3101 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3103 // don't convert at all
3107 // we trust OS to do conversion better than we can so try external
3108 // conversion methods first
3110 // the full order is:
3111 // 1. OS conversion (iconv() under Unix or Win32 API)
3112 // 2. hard coded conversions for UTF
3113 // 3. wxEncodingConverter as fall back
3119 #endif // !wxUSE_FONTMAP
3122 wxFontEncoding
encoding(m_encoding
);
3127 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
3135 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3136 #endif // wxUSE_FONTMAP
// gs_nameCache remembers, per encoding, the charset name used to create
// the iconv converter; an empty string records a failure
3140 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3141 if ( it
!= gs_nameCache
.end() )
3143 if ( it
->second
.empty() )
3146 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
3153 const wxChar
* const* names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3154 // CS : in case this does not return valid names (eg for MacRoman)
3155 // encoding got a 'failure' entry in the cache all the same,
3156 // although it just has to be created using a different method, so
3157 // only store failed iconv creation attempts (or perhaps we
3158 // shouldn't do this at all ?)
3159 if ( names
[0] != NULL
)
// try every name known for this encoding in turn
3161 for ( ; *names
; ++names
)
3163 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3164 // will need changes that will obsolete this
3165 wxString
name(*names
);
3166 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
3169 gs_nameCache
[encoding
] = *names
;
3176 gs_nameCache
[encoding
] = wxT(""); // cache the failure
3179 #endif // wxUSE_FONTMAP
3181 #endif // HAVE_ICONV
3183 #ifdef wxHAVE_WIN32_MB2WC
// the name, if there is one, takes precedence over the encoding value
3186 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3187 : new wxMBConv_win32(m_encoding
);
3196 #endif // wxHAVE_WIN32_MB2WC
3200 // leave UTF16 and UTF32 to the built-ins of wx
3201 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3202 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3205 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3206 : new wxMBConv_cf(m_encoding
);
3208 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3217 #endif // __DARWIN__
3220 wxFontEncoding enc
= m_encoding
;
3222 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3224 // use "false" to suppress interactive dialogs -- we can be called from
3225 // anywhere and popping up a dialog from here is the last thing we want to
3227 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3229 #endif // wxUSE_FONTMAP
// the UTF encodings are handled by the built-in converter classes
3233 case wxFONTENCODING_UTF7
:
3234 return new wxMBConvUTF7
;
3236 case wxFONTENCODING_UTF8
:
3237 return new wxMBConvUTF8
;
3239 case wxFONTENCODING_UTF16BE
:
3240 return new wxMBConvUTF16BE
;
3242 case wxFONTENCODING_UTF16LE
:
3243 return new wxMBConvUTF16LE
;
3245 case wxFONTENCODING_UTF32BE
:
3246 return new wxMBConvUTF32BE
;
3248 case wxFONTENCODING_UTF32LE
:
3249 return new wxMBConvUTF32LE
;
3252 // nothing to do but put here to suppress gcc warnings
// wxEncodingConverter-based conversion is the last candidate
3259 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3260 : new wxMBConv_wxwin(m_encoding
);
3267 wxLogTrace(TRACE_STRCONV
,
3268 wxT("encoding \"%s\" is not supported by this system"),
3269 (m_name
? wxString(m_name
)
3270 : wxFontMapperBase::GetEncodingName(m_encoding
)));
3271 #endif // wxUSE_FONTMAP
// Tell whether this object can be used for conversions: always true for
// wxFONTENCODING_ISO8859_1, otherwise true only if a real converter exists.
3276 bool wxCSConv::IsOk() const
3278 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3279 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3280 return true; // always ok as we do it ourselves
3282 // m_convReal->IsOk() is called at its own creation, so we know it must
3283 // be ok if m_convReal is non-NULL
3284 return m_convReal
!= NULL
;
// Convert srcLen bytes of src to wide characters in dst.
//
// The call is forwarded to m_convReal->ToWChar() with the same arguments;
// the remaining code performs the conversion itself by copying every byte to
// one wide character. wxCONV_FAILED is returned if dstLen is smaller than the
// number of characters to write.
3287 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3288 const char *src
, size_t srcLen
) const
3291 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3294 if ( srcLen
== wxNO_LEN
)
3295 srcLen
= strlen(src
) + 1; // take trailing NUL too
3299 if ( dstLen
< srcLen
)
3300 return wxCONV_FAILED
;
3302 for ( size_t n
= 0; n
< srcLen
; n
++ )
// go through unsigned char so that a byte is never widened as a
// negative value
3303 dst
[n
] = (unsigned char)(src
[n
]);
// Convert srcLen wide characters of src to bytes in dst.
//
// The call is forwarded to m_convReal->FromWChar() with the same arguments;
// the remaining code performs the conversion itself, one byte per character.
// wxCONV_FAILED is returned if dstLen is smaller than srcLen or if a
// character is greater than 0xFF; the second loop applies the same character
// check without writing anything.
//
// NOTE(review): if wchar_t is a signed type, negative values would pass the
// "> 0xFF" tests -- confirm whether this can happen on supported platforms.
3309 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3310 const wchar_t *src
, size_t srcLen
) const
3313 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3316 if ( srcLen
== wxNO_LEN
)
// count the trailing NUL too
3317 srcLen
= wxWcslen(src
) + 1;
3321 if ( dstLen
< srcLen
)
3322 return wxCONV_FAILED
;
3324 for ( size_t n
= 0; n
< srcLen
; n
++ )
3326 if ( src
[n
] > 0xFF )
3327 return wxCONV_FAILED
;
3329 dst
[n
] = (char)src
[n
];
3333 else // still need to check the input validity
3335 for ( size_t n
= 0; n
< srcLen
; n
++ )
3337 if ( src
[n
] > 0xFF )
3338 return wxCONV_FAILED
;
// Size of the NUL terminator: the value reported by the real converter or,
// failing that, the one for ISO-8859-1.
3345 size_t wxCSConv::GetMBNulLen() const
3348 return m_convReal
->GetMBNulLen();
3350 // otherwise, we are ISO-8859-1
3354 #if wxUSE_UNICODE_UTF8
// Tell whether this converter works with UTF-8: the question is passed on to
// the real converter; the remaining case is ISO-8859-1.
3355 bool wxCSConv::IsUTF8() const
3358 return m_convReal
->IsUTF8();
3360 // otherwise, we are ISO-8859-1
// Convert a multibyte string to a wide character buffer, using several
// converters: wxConvLibc first, then UTF-8 and finally ISO8859-1. An empty
// wxWCharBuffer is returned by the early exit at the top.
//
// NOTE(review): each later conversion presumably runs only if the previous
// one produced nothing -- confirm.
3368 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3371 return wxWCharBuffer();
3373 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3375 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3377 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Convert a wide character string to a multibyte buffer, using wxConvLibc
// first and then a UTF-8 converter created with the
// MAP_INVALID_UTF8_TO_OCTAL option. An empty wxCharBuffer is returned by the
// early exit at the top.
//
// NOTE(review): the UTF-8 conversion presumably runs only if wxConvLibc
// produced nothing -- confirm.
3382 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3385 return wxCharBuffer();
3387 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3389 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3394 #endif // wxUSE_UNICODE
3396 // ----------------------------------------------------------------------------
3398 // ----------------------------------------------------------------------------
3400 // NB: The reason why we create converter objects in this convoluted way,
3401 // using a factory function instead of global variable, is that they
3402 // may be used at static initialization time (some of them are used by
3403 // wxString ctors and there may be a global wxString object). In other
3404 // words, possibly _before_ the converter global object would be
3411 #undef wxConvISO8859_1
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines:
//  - the exported pointer name##Ptr of type klass*, initially NULL;
//  - the exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass built with ctor_args;
//  - the file-local gs_##name##instance, initialized by calling the accessor.
// WX_DEFINE_GLOBAL_CONV is the shorthand for the case where the declared and
// the implementation classes are the same.
3413 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3414 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3415 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3417 static impl_klass name##Obj ctor_args; \
3418 return &name##Obj; \
3420 /* this ensures that all global converter objects are created */ \
3421 /* by the time static initialization is done, i.e. before any */ \
3422 /* thread is launched: */ \
3423 static klass* gs_##name##instance = wxGet_##name##Ptr()
3425 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3426 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3429 // disable warning "variable 'xxx' was declared but never referenced"
3430 #pragma warning(disable: 177)
// wxConvLibc is declared as a wxMBConv in all variants below, only the class
// implementing it differs
3434 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3435 #elif 0 // defined(__WXOSX__)
3436 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_cf
, wxConvLibc
, (wxFONTENCODING_UTF8
));
3438 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3441 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3442 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3443 // provokes an error message about "not enough macro parameters"; and we
3444 // can't use "()" here as the name##Obj declaration would be parsed as a
3445 // function declaration then, so use a semicolon and live with an extra
3446 // empty statement (and hope that no compiler warns about this)
3447 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3448 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
3450 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3451 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to
// the wxConvLocal one, both obtained through their accessor functions
3453 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3454 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3457 // It is important to use this conversion object under Darwin as it ensures
3458 // that Unicode strings are (re)composed correctly even though xnu kernel uses
3459 // decomposed form internally (at least for the file names).
3460 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
// the converter used for file names: outside of Darwin it is the wxConvLibc
// object
3463 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3466 #else // !__DARWIN__
3467 wxGet_wxConvLibcPtr();
3468 #endif // __DARWIN__/!__DARWIN__