1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV wxT("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
// The visible loop advances p over NUL bytes, continuing while n is non-zero
// and the byte just read is '\0'.
// NOTE(review): the loop body and the return statement are not part of this
// excerpt, so the exact return convention cannot be confirmed from here.
73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
74 static bool NotAllNULs(const char *p
, size_t n
)
76 while ( n
&& *p
++ == '\0' )
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
// Encodes the code point `input` as UTF-16 code units written through `output`.
// Visible paths: `input` stored directly as one wxUint16; a separate branch for
// input >= 0x110000 (body not shown here); otherwise a surrogate pair.
// NOTE(review): the guarding conditions and return statements are missing from
// this excerpt.
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
// high surrogate: (input >> 10) + 0xd7c0 == 0xd800 + ((input - 0x10000) >> 10)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
// low surrogate: the 10 least significant bits offset by 0xdc00
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
// Decodes one code point from the UTF-16 units at `input` into `output`.
// Visible logic: a unit outside 0xd800..0xdfff is handled by the first branch
// (body not shown); a surrogate followed by a unit outside 0xdc00..0xdfff leads
// to wxCONV_FAILED; otherwise the two units are combined into one value.
// NOTE(review): the first test does not distinguish a leading low surrogate
// (0xdc00..0xdfff) from a high one, so such a unit followed by a low surrogate
// is not rejected by the checks visible here -- confirm whether intended.
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
// inverse of the surrogate computation used by encode_utf16()
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
// Element type of the buffer accepted by wxDecodeSurrogate(): wchar_t in one
// configuration and wxUint16 in the other.
// NOTE(review): the #if/#else lines selecting between the two typedefs are not
// part of this excerpt; the closing #endif names WC_UTF16 as the condition.
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
// Wrapper around decode_utf16() operating on a pointer-to-pointer so that the
// caller's position can be updated.
// NOTE(review): the declarations of `out` and `n`, the failure handling and the
// return statement are missing from this excerpt; the sentence started on the
// third comment line below is also cut off.
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(reinterpret_cast<const wxUint16
*>(*pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
// Default ToWChar() implemented on top of the older MB2WC().
// dst/dstLen: output buffer and its capacity; src/srcLen: input bytes and their
// count, with wxNO_LEN selecting NUL-terminated input.
// Failure paths visible here return wxCONV_FAILED: GetMBNulLen() failing,
// MB2WC() failing, or dstWritten exceeding dstLen.
// NOTE(review): a number of statements (declarations of bufTmp/srcEnd, loop
// header, returns) are missing from this excerpt.
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten
= 0;
176 // the number of NULs terminating this string
177 size_t nulLen
= 0; // not really needed, but just to avoid warnings
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
185 if ( srcLen
!= wxNO_LEN
)
187 // we need to know how to find the end of this string
188 nulLen
= GetMBNulLen();
189 if ( nulLen
== wxCONV_FAILED
)
190 return wxCONV_FAILED
;
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
197 char * const p
= bufTmp
.data();
198 memcpy(p
, src
, srcLen
);
// this loop covers the nulLen bytes following the copied input
199 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
205 srcEnd
= src
+ srcLen
;
207 else // quit after the first loop iteration
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
216 // all the complications come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
229 // try to convert the current chunk
// first call with a NULL buffer only asks for the chunk's length
230 size_t lenChunk
= MB2WC(NULL
, src
, 0);
231 if ( lenChunk
== wxCONV_FAILED
)
232 return wxCONV_FAILED
;
234 dstWritten
+= lenChunk
;
240 // nothing left in the input string, conversion succeeded
246 if ( dstWritten
> dstLen
)
247 return wxCONV_FAILED
;
249 // +1 is for trailing NUL
250 if ( MB2WC(dst
, src
, lenChunk
+ 1) == wxCONV_FAILED
)
251 return wxCONV_FAILED
;
260 // we convert just one chunk in this case as this is the entire
261 // string anyhow (and we don't count the trailing NUL in this case)
265 // advance the input pointer past the end of this chunk: notice that we
266 // will always stop before srcEnd because we know that the chunk is
267 // always properly NUL-terminated
268 while ( NotAllNULs(src
, nulLen
) )
270 // notice that we must skip over multiple bytes here as we suppose
271 // that if NUL takes 2 or 4 bytes, then all the other characters do
272 // too and so if advanced by a single byte we might erroneously
273 // detect sequences of NUL bytes in the middle of the input
277 // if the buffer ends before this NUL, we shouldn't count it in our
278 // output so skip the code below
282 // do count this terminator as it's inside the buffer we convert
287 src
+= nulLen
; // skip the terminator itself
// Default FromWChar() implemented on top of the older WC2MB().
// dst/dstLen: output buffer and its capacity; src/srcLen: wide input and its
// length, with wxNO_LEN selecting NUL-terminated input.
// Failure paths visible here return wxCONV_FAILED: WC2MB() failing or
// dstWritten exceeding dstLen.
// NOTE(review): several statements (declarations of dstBuf/dstTmp, loop
// condition, final return) are missing from this excerpt.
297 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
298 const wchar_t *src
, size_t srcLen
) const
300 // the number of chars [which would be] written to dst [if it were not NULL]
301 size_t dstWritten
= 0;
303 // if we don't know its length we have no choice but to assume that it is
304 // NUL-terminated (notice that it can still be NUL-terminated even if
305 // explicit length is given but it doesn't change our return value)
306 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
308 // make a copy of the input string unless it is already properly
310 wxWCharBuffer bufTmp
;
311 if ( isNulTerminated
)
313 srcLen
= wxWcslen(src
) + 1;
315 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
317 // make a copy in order to properly NUL-terminate the string
318 bufTmp
= wxWCharBuffer(srcLen
);
319 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
323 const size_t lenNul
= GetMBNulLen();
324 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
326 src
++ /* skip L'\0' too */ )
328 // try to convert the current chunk
// first call with a NULL buffer only asks for the chunk's length
329 size_t lenChunk
= WC2MB(NULL
, src
, 0);
330 if ( lenChunk
== wxCONV_FAILED
)
331 return wxCONV_FAILED
;
333 dstWritten
+= lenChunk
;
335 const wchar_t * const
336 chunkEnd
= isNulTerminated
? srcEnd
- 1 : src
+ wxWcslen(src
);
338 // our return value accounts for the trailing NUL(s), unlike that of
339 // WC2MB(), however don't do it for the last NUL we artificially added
341 if ( chunkEnd
< srcEnd
)
342 dstWritten
+= lenNul
;
346 if ( dstWritten
> dstLen
)
347 return wxCONV_FAILED
;
349 // if we know that there is enough space in the destination buffer
350 // (because we accounted for lenNul in dstWritten above), we can
351 // convert directly in place -- but otherwise we need another
352 // temporary buffer to ensure that we don't overwrite the output
355 if ( chunkEnd
== srcEnd
)
357 dstBuf
= wxCharBuffer(lenChunk
+ lenNul
- 1);
358 dstTmp
= dstBuf
.data();
365 if ( WC2MB(dstTmp
, src
, lenChunk
+ lenNul
) == wxCONV_FAILED
)
366 return wxCONV_FAILED
;
370 // copy everything up to but excluding the terminating NUL(s)
371 // into the real output buffer
372 memcpy(dst
, dstTmp
, lenChunk
);
374 // micro-optimization: if dstTmp != dst it means that chunkEnd
375 // == srcEnd and so we're done, no need to update anything below
380 if ( chunkEnd
< srcEnd
)
// Legacy entry point expressed through ToWChar(): forwards outBuff/outLen and
// inBuff (no explicit input length is passed) and post-processes a successful
// result.
// NOTE(review): the adjustment of rc and the return statement are missing from
// this excerpt.
390 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
392 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
393 if ( rc
!= wxCONV_FAILED
)
395 // ToWChar() returns the buffer length, i.e. including the trailing
396 // NUL, while this method doesn't take it into account
// Legacy entry point expressed through FromWChar(): forwards outBuff/outLen
// and inBuff (no explicit input length is passed) and post-processes a
// successful result.
// NOTE(review): the handling of rc after the check and the return statement
// are missing from this excerpt.
403 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
405 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
406 if ( rc
!= wxCONV_FAILED
)
// Destructor defined out of line; per the comment below its body is
// intentionally empty.
414 wxMBConv::~wxMBConv()
416 // nothing to do here (necessary for Darwin linking probably)
// Converts the NUL-terminated string psz to a wide character buffer in two
// passes: ToWChar() with a NULL destination to obtain the length, then
// ToWChar() into a buffer of that size. The final line returns an empty
// wxWCharBuffer.
// NOTE(review): the successful return statement is missing from this excerpt.
419 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
423 // calculate the length of the buffer needed first
424 const size_t nLen
= ToWChar(NULL
, 0, psz
);
425 if ( nLen
!= wxCONV_FAILED
)
427 // now do the actual conversion
428 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
430 // +1 for the trailing NUL
431 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
436 return wxWCharBuffer();
// Converts the NUL-terminated wide string pwz to a multibyte buffer in two
// passes: FromWChar() with a NULL destination to obtain the length, then
// FromWChar() into a buffer constructed with nLen - 1. The final line returns
// an empty wxCharBuffer.
// NOTE(review): the successful return statement is missing from this excerpt.
439 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
443 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
444 if ( nLen
!= wxCONV_FAILED
)
446 wxCharBuffer
buf(nLen
- 1);
447 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
452 return wxCharBuffer();
// Converts inLen bytes of inBuff (or a NUL-terminated string when inLen is
// wxNO_LEN) to a wide character buffer; outLen is an output parameter.
// The final line returns an empty wxWCharBuffer.
// NOTE(review): the code storing into *outLen and the successful return are
// missing from this excerpt.
456 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
458 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
459 if ( dstLen
!= wxCONV_FAILED
)
461 // notice that we allocate space for dstLen+1 wide characters here
462 // because we want the buffer to always be NUL-terminated, even if the
463 // input isn't (as otherwise the caller has no way to know its length)
464 wxWCharBuffer
wbuf(dstLen
);
465 wbuf
.data()[dstLen
] = L
'\0';
466 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
472 // we also need to handle NUL-terminated input strings
473 // specially: for them the output is the length of the string
474 // excluding the trailing NUL, however if we're asked to
475 // convert a specific number of characters we return the length
476 // of the resulting output even if it's NUL-terminated
477 if ( inLen
== wxNO_LEN
)
488 return wxWCharBuffer();
// Converts inLen wide characters of inBuff (or a NUL-terminated string when
// inLen is wxNO_LEN) to a multibyte buffer; outLen is an output parameter.
// The buffer is sized dstLen + nulLen - 1 and the nulLen bytes at offset
// dstLen are zeroed before converting. The final line returns an empty
// wxCharBuffer.
// NOTE(review): the code storing into *outLen and the successful return are
// missing from this excerpt.
492 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
494 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
495 if ( dstLen
!= wxCONV_FAILED
)
497 const size_t nulLen
= GetMBNulLen();
499 // as above, ensure that the buffer is always NUL-terminated, even if
501 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
502 memset(buf
.data() + dstLen
, 0, nulLen
);
503 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
509 if ( inLen
== wxNO_LEN
)
511 // in this case both input and output are NUL-terminated
512 // and we're not supposed to count NUL
524 return wxCharBuffer();
// Converts the contents of a scoped char buffer, using buf.length() as the
// explicit input length, in two ToWChar() passes (length query, then the
// conversion). A L'\0' is stored at index dstLen before converting.
// The final line returns an empty wxWCharBuffer.
// NOTE(review): the successful return statement is missing from this excerpt.
527 const wxWCharBuffer
wxMBConv::cMB2WC(const wxScopedCharBuffer
& buf
) const
529 const size_t srcLen
= buf
.length();
532 const size_t dstLen
= ToWChar(NULL
, 0, buf
, srcLen
);
533 if ( dstLen
!= wxCONV_FAILED
)
535 wxWCharBuffer
wbuf(dstLen
);
536 wbuf
.data()[dstLen
] = L
'\0';
537 if ( ToWChar(wbuf
.data(), dstLen
, buf
, srcLen
) != wxCONV_FAILED
)
542 return wxWCharBuffer();
// Converts the contents of a scoped wide char buffer, using wbuf.length() as
// the explicit input length, in two FromWChar() passes (length query, then the
// conversion). A single '\0' byte is stored at index dstLen before converting.
// The final line returns an empty wxCharBuffer.
// NOTE(review): unlike the (inBuff, inLen, outLen) overload, the visible code
// does not consult GetMBNulLen() for the terminator size -- confirm whether a
// one-byte terminator is sufficient for all conversions.
545 const wxCharBuffer
wxMBConv::cWC2MB(const wxScopedWCharBuffer
& wbuf
) const
547 const size_t srcLen
= wbuf
.length();
550 const size_t dstLen
= FromWChar(NULL
, 0, wbuf
, srcLen
);
551 if ( dstLen
!= wxCONV_FAILED
)
553 wxCharBuffer
buf(dstLen
);
554 buf
.data()[dstLen
] = '\0';
555 if ( FromWChar(buf
.data(), dstLen
, wbuf
, srcLen
) != wxCONV_FAILED
)
560 return wxCharBuffer();
563 // ----------------------------------------------------------------------------
565 // ----------------------------------------------------------------------------
// Thin wrapper: forwards all three arguments unchanged to wxMB2WC() and
// returns its result.
567 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
569 return wxMB2WC(buf
, psz
, n
);
// Thin wrapper: forwards all three arguments unchanged to wxWC2MB() and
// returns its result.
572 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
574 return wxWC2MB(buf
, psz
, n
);
577 // ----------------------------------------------------------------------------
578 // wxConvBrokenFileNames
579 // ----------------------------------------------------------------------------
583 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
585 if ( wxStricmp(charset
, wxT("UTF-8")) == 0 ||
586 wxStricmp(charset
, wxT("UTF8")) == 0 )
587 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
589 m_conv
= new wxCSConv(charset
);
594 // ----------------------------------------------------------------------------
596 // ----------------------------------------------------------------------------
598 // Implementation (C) 2004 Fredrik Roubert
600 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
// Indexed by byte value; yields the 6-bit value of a base64 character:
// 'A'-'Z' -> 0x00-0x19, 'a'-'z' -> 0x1a-0x33, '0'-'9' -> 0x34-0x3d,
// '+' -> 0x3e, '/' -> 0x3f. Every other byte maps to 0xff.
603 // BASE64 decoding table
605 static const unsigned char utf7unb64
[] =
607 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
608 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
609 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
610 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
611 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
612 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
613 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
614 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
615 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
616 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
617 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
618 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
619 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
620 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
621 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
622 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
623 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
624 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
625 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
626 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
627 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
628 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
629 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
630 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
631 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
632 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
633 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
634 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
635 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
636 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
637 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
638 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
// UTF-7 decoder. dst/dstLen: output buffer and capacity; src/srcLen: input
// bytes, with wxNO_LEN meaning a NUL-terminated string (strlen(src) + 1 bytes
// are then processed). With wxNO_LEN a local decoder state is used; otherwise
// the member m_stateDecoder is used and a copy of it is saved in stateOrig.
// All error paths visible here return wxCONV_FAILED.
// NOTE(review): many statements (output counting, state updates, the final
// return) are missing from this excerpt.
641 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
642 const char *src
, size_t srcLen
) const
644 DecoderState stateOrig
,
646 if ( srcLen
== wxNO_LEN
)
648 // convert the entire string, up to and including the trailing NUL
649 srcLen
= strlen(src
) + 1;
651 // when working on the entire strings we don't update nor use the shift
652 // state from the previous call
653 statePtr
= &stateOrig
;
655 else // when working with partial strings we do use the shift state
657 statePtr
= const_cast<DecoderState
*>(&m_stateDecoder
);
659 // also save the old state to be able to rollback to it on error
660 stateOrig
= m_stateDecoder
;
663 // but to simplify the code below we use this variable in both cases
664 DecoderState
& state
= *statePtr
;
667 // number of characters [which would have been] written to dst [if it were
671 const char * const srcEnd
= src
+ srcLen
;
673 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
675 const unsigned char cc
= *src
++;
677 if ( state
.IsShifted() )
679 const unsigned char dc
= utf7unb64
[cc
];
682 // end of encoded part, check that nothing was left: there can
683 // be up to 4 bits of 0 padding but nothing else (we also need
684 // to check isLSB as we count bits modulo 8 while a valid UTF-7
685 // encoded sequence must contain an integral number of UTF-16
687 if ( state
.isLSB
|| state
.bit
> 4 ||
688 (state
.accum
& ((1 << state
.bit
) - 1)) )
693 return wxCONV_FAILED
;
698 // re-parse this character normally below unless it's '-' which
699 // is consumed by the decoder
703 else // valid encoded character
705 // mini base64 decoder: each character is 6 bits
710 if ( state
.bit
>= 8 )
712 // got the full byte, consume it
714 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
718 // we've got the full word, output it
720 *dst
++ = (state
.msb
<< 8) | b
;
726 // just store it while we wait for LSB
734 if ( state
.IsDirect() )
736 // start of an encoded segment?
741 // just the encoded plus sign, don't switch to shifted mode
// NOTE(review): src is a `const char *`, so where plain char is signed a byte
// >= 0x80 becomes a huge index after the (unsigned) cast and reads outside the
// table; `(unsigned char)*src` looks intended. No src < srcEnd check is visible
// before this look-ahead either -- confirm against the full source.
747 else if ( utf7unb64
[(unsigned)*src
] == 0xff )
749 // empty encoded chunks are not allowed
753 return wxCONV_FAILED
;
755 else // base-64 encoded chunk follows
762 // only printable 7 bit ASCII characters (with the exception of
763 // NUL, TAB, CR and LF) can be used directly
764 if ( cc
>= 0x7f || (cc
< ' ' &&
765 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
766 return wxCONV_FAILED
;
777 // as we didn't read any characters we should be called with the same
778 // data (followed by some more new data) again later so don't save our
782 return wxCONV_FAILED
;
// Indexed by a 6-bit value (0..63); yields the corresponding character of the
// base64 alphabet: 'A'-'Z', 'a'-'z', '0'-'9', '+', '/'.
789 // BASE64 encoding table
791 static const unsigned char utf7enb64
[] =
793 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
794 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
795 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
796 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
797 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
798 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
799 'w', 'x', 'y', 'z', '0', '1', '2', '3',
800 '4', '5', '6', '7', '8', '9', '+', '/'
// UTF-7 encoding table, indexed by 7-bit character code:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};

// Returns true if wc may be written to UTF-7 output as is, i.e. if it is a
// 7-bit character classified as 0 (Set D) in utf7encode.
static inline bool wxIsUTF7Direct(wchar_t wc)
{
    // wchar_t is a signed type on some platforms: compare the value as
    // unsigned so that a negative wc can never be used as a table index
    const unsigned long code = static_cast<unsigned long>(wc);

    return code < 0x80 && utf7encode[code] < 1;
}
// UTF-7 encoder. dst/dstLen: output buffer and capacity; src/srcLen: wide
// input, with wxNO_LEN meaning a NUL-terminated string (wxWcslen(src) + 1
// characters are then processed). With wxNO_LEN a local encoder state is used;
// otherwise the member m_stateEncoder is used and a copy is kept in stateOrig.
// Characters above 0xffff are rejected with wxCONV_FAILED.
// NOTE(review): many statements (length counting, mode switches, the final
// return) are missing from this excerpt.
828 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
829 const wchar_t *src
, size_t srcLen
) const
831 EncoderState stateOrig
,
833 if ( srcLen
== wxNO_LEN
)
835 // we don't apply the stored state when operating on entire strings at
837 statePtr
= &stateOrig
;
839 srcLen
= wxWcslen(src
) + 1;
841 else // do use the mode we left the output in previously
843 stateOrig
= m_stateEncoder
;
844 statePtr
= const_cast<EncoderState
*>(&m_stateEncoder
);
847 EncoderState
& state
= *statePtr
;
852 const wchar_t * const srcEnd
= src
+ srcLen
;
853 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
856 if ( wxIsUTF7Direct(cc
) )
858 if ( state
.IsShifted() )
860 // pad with zeros the last encoded block if necessary
864 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
879 else if ( cc
== '+' && state
.IsDirect() )
890 else if (((wxUint32
)cc
) > 0xffff)
892 // no surrogate pair generation (yet?)
893 return wxCONV_FAILED
;
898 if ( state
.IsDirect() )
907 // BASE64 encode string
// each character is fed as two bytes, most significant byte first
910 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
913 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
915 for (state
.bit
+= 8; state
.bit
>= 6; )
919 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
924 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
932 // we need to restore the original encoder state if we were called just to
933 // calculate the amount of space needed as we will presumably be called
934 // again to really convert the data now
941 // ----------------------------------------------------------------------------
943 // ----------------------------------------------------------------------------
// utf8_max[i] is the largest value with 7, 11, 16, 21, 26 and 31 bits for
// i = 0..5 (the payload sizes of 1- to 6-byte UTF-8 sequences), followed by
// 0xffffffff as the last entry.
945 static const wxUint32 utf8_max
[]=
946 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
948 // boundaries of the private use area we use to (temporarily) remap
949 // characters invalid in a UTF-8 encoded string
// the range is [wxUnicodePUA, wxUnicodePUAEnd): 256 code points, one per
// possible byte value
950 const wxUint32 wxUnicodePUA
= 0x100000;
951 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
// this table gives the length of the UTF-8 encoding from its first character;
// 0 means that the byte can't start a valid sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF

    // C0 and C1 could only start overlong encodings of ASCII characters, so
    // they are invalid too; without these two entries all the following ones
    // would be shifted by two positions relative to their row labels
    0, 0,                                            // C0..C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
// Strict UTF-8 decoder. dst/dstLen: output buffer and capacity (a zero dstLen
// makes `out` NULL); src/srcLen: input bytes, wxNO_LEN meaning NUL-terminated.
// The lead byte selects the sequence length through tableUtf8Lengths, its
// marker bits are verified, and every continuation byte must match 10xxxxxx,
// otherwise wxCONV_FAILED is returned.
// NOTE(review): many statements (counters, length bookkeeping, returns) are
// missing from this excerpt. As shown, srcLen is replaced by strlen(src) + 1
// when it equals wxNO_LEN, so the later `srcLen == wxNO_LEN` tests would never
// be true -- confirm against the full source.
989 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
990 const char *src
, size_t srcLen
) const
992 wchar_t *out
= dstLen
? dst
: NULL
;
995 if ( srcLen
== wxNO_LEN
)
996 srcLen
= strlen(src
) + 1;
998 for ( const char *p
= src
; ; p
++ )
1000 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
1002 // all done successfully, just add the trailing NUL if we are not
1003 // using explicit length
1004 if ( srcLen
== wxNO_LEN
)
1020 if ( out
&& !dstLen
-- )
1024 unsigned char c
= *p
;
1028 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
1031 if ( srcLen
!= wxNO_LEN
)
1038 unsigned len
= tableUtf8Lengths
[c
];
1042 if ( srcLen
< len
) // the test works for wxNO_LEN too
1045 if ( srcLen
!= wxNO_LEN
)
1048 // Char. number range | UTF-8 octet sequence
1049 // (hexadecimal) | (binary)
1050 // ----------------------+----------------------------------------
1051 // 0000 0000 - 0000 007F | 0xxxxxxx
1052 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1053 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1054 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1056 // Code point value is stored in bits marked with 'x',
1057 // lowest-order bit of the value on the right side in the diagram
1058 // above. (from RFC 3629)
1060 // mask to extract lead byte's value ('x' bits above), by sequence
1062 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1064 // mask and value of lead byte's most significant bits, by length:
1065 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1066 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1068 len
--; // it's more convenient to work with 0-based length here
1070 // extract the lead byte's value bits:
1071 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
1074 code
= c
& leadValueMask
[len
];
1076 // all remaining bytes, if any, are handled in the same way
1077 // regardless of sequence's length:
1078 for ( ; len
; --len
)
1081 if ( (c
& 0xC0) != 0x80 )
1082 return wxCONV_FAILED
;
1090 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1091 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
1100 #endif // WC_UTF16/!WC_UTF16
1108 return wxCONV_FAILED
;
// Strict UTF-8 encoder. dst/dstLen: output buffer and capacity (a zero dstLen
// makes `out` NULL); src/srcLen: wide input, wxNO_LEN meaning NUL-terminated.
// Each code point is written as 1 to 4 bytes depending on its value; values
// above 0x10FFFF reach the wxFAIL_MSG() branch.
// NOTE(review): many statements (length counting, buffer checks, the
// successful return) are missing from this excerpt.
1112 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
1113 const wchar_t *src
, size_t srcLen
) const
1115 char *out
= dstLen
? dst
: NULL
;
1118 for ( const wchar_t *wp
= src
; ; wp
++ )
1120 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
) )
1122 // all done successfully, just add the trailing NUL if we are not
1123 // using explicit length
1124 if ( srcLen
== wxNO_LEN
)
1140 if ( srcLen
!= wxNO_LEN
)
1145 // cast is ok for WC_UTF16
1146 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1148 // skip the next char too as we decoded a surrogate
1151 #else // wchar_t is UTF-32
1152 code
= *wp
& 0x7fffffff;
1164 out
[0] = (char)code
;
1167 else if ( code
<= 0x07FF )
1175 // NB: this line takes 6 least significant bits, encodes them as
1176 // 10xxxxxx and discards them so that the next byte can be encoded:
1177 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1178 out
[0] = 0xC0 | code
;
// NOTE(review): with `<` the value 0xFFFF itself is not handled here but by the
// 4-byte branch below, which would emit the overlong form F0 8F BF BF; `<=`
// looks intended (3-byte sequences cover 0x0800..0xFFFF) -- confirm.
1181 else if ( code
< 0xFFFF )
1189 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1190 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1191 out
[0] = 0xE0 | code
;
1194 else if ( code
<= 0x10FFFF )
1202 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1203 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1204 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1205 out
[0] = 0xF0 | code
;
1210 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1223 // we only get here if an error occurs during decoding
1224 return wxCONV_FAILED
;
// UTF-8 decoder with optional handling of invalid input selected by m_options:
// with MAP_INVALID_UTF8_NOT the call is forwarded to
// wxMBConvStrictUTF8::ToWChar(); with MAP_INVALID_UTF8_TO_PUA each offending
// byte is emitted as wxUnicodePUA + byte; with MAP_INVALID_UTF8_TO_OCTAL it is
// emitted using three octal digits.
// buf/n: output buffer and capacity; psz/srcLen: input bytes, wxNO_LEN meaning
// NUL-terminated.
// NOTE(review): many statements (declaration of len/cnt, counting, the final
// return) are missing from this excerpt.
1227 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1228 const char *psz
, size_t srcLen
) const
1230 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1231 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1235 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1237 const char *opsz
= psz
;
1238 bool invalid
= false;
1239 unsigned char cc
= *psz
++, fc
= cc
;
// this loop runs once per leading 1 bit of the first byte
1241 for (cnt
= 0; fc
& 0x80; cnt
++)
1251 // escape the escape character for octal escapes
1252 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1253 && cc
== '\\' && (!buf
|| len
< n
))
1265 // invalid UTF-8 sequence
1270 unsigned ocnt
= cnt
- 1;
1271 wxUint32 res
= cc
& (0x3f >> cnt
);
1275 if ((cc
& 0xC0) != 0x80)
1277 // invalid UTF-8 sequence
1283 res
= (res
<< 6) | (cc
& 0x3f);
1286 if (invalid
|| res
<= utf8_max
[ocnt
])
1288 // illegal UTF-8 encoding
1291 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1292 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1294 // if one of our PUA characters turns up externally
1295 // it must also be treated as an illegal sequence
1296 // (a bit like you have to escape an escape character)
1302 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1303 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1304 if (pa
== wxCONV_FAILED
)
1316 *buf
++ = (wchar_t)res
;
1318 #endif // WC_UTF16/!WC_UTF16
1324 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1326 while (opsz
< psz
&& (!buf
|| len
< n
))
1329 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1330 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1331 wxASSERT(pa
!= wxCONV_FAILED
);
1338 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1344 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1346 while (opsz
< psz
&& (!buf
|| len
< n
))
1348 if ( buf
&& len
+ 3 < n
)
1350 unsigned char on
= *opsz
;
// the byte is written as three octal digits, most significant first
1352 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1353 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1354 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1361 else // MAP_INVALID_UTF8_NOT
1363 return wxCONV_FAILED
;
1369 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
// Returns true if wch is one of the wide characters L'0' through L'7'.
1375 static inline bool isoctal(wchar_t wch
)
1377 return L
'0' <= wch
&& wch
<= L
'7';
// UTF-8 encoder, the reverse of wxMBConvUTF8::ToWChar(): with
// MAP_INVALID_UTF8_NOT the call is forwarded to
// wxMBConvStrictUTF8::FromWChar(); with MAP_INVALID_UTF8_TO_PUA characters in
// [wxUnicodePUA, wxUnicodePUAEnd) are written back as the single byte
// cc - wxUnicodePUA; with MAP_INVALID_UTF8_TO_OCTAL a doubled backslash and
// three-octal-digit sequences are recognized.
// buf/n: output buffer and capacity; psz/srcLen: wide input, wxNO_LEN meaning
// NUL-terminated.
// NOTE(review): many statements (declaration of len/cc/cnt, counting, the
// final return) are missing from this excerpt. The octal branch reads psz[0],
// psz[1] and psz[2]; no bound check against srcLen is visible -- confirm.
1380 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1381 const wchar_t *psz
, size_t srcLen
) const
1383 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1384 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1388 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1393 // cast is ok for WC_UTF16
1394 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1395 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1397 cc
= (*psz
++) & 0x7fffffff;
1400 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1401 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1404 *buf
++ = (char)(cc
- wxUnicodePUA
);
1407 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1408 && cc
== L
'\\' && psz
[0] == L
'\\' )
1415 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1417 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1421 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1422 (psz
[1] - L
'0') * 010 +
// cnt ends up as the index of the first utf8_max entry not smaller than cc
1432 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1448 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1450 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1456 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1462 // ============================================================================
1464 // ============================================================================
1466 #ifdef WORDS_BIGENDIAN
1467 #define wxMBConvUTF16straight wxMBConvUTF16BE
1468 #define wxMBConvUTF16swap wxMBConvUTF16LE
1470 #define wxMBConvUTF16swap wxMBConvUTF16BE
1471 #define wxMBConvUTF16straight wxMBConvUTF16LE
// Determines the input length in bytes. With wxNO_LEN the input is scanned as
// 16-bit units up to and including the first zero unit and the count is
// multiplied by BYTES_PER_CHAR; with an explicit length, a value which is not
// a multiple of BYTES_PER_CHAR yields wxCONV_FAILED.
// NOTE(review): the final return statement is missing from this excerpt.
1475 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1477 if ( srcLen
== wxNO_LEN
)
1479 // count the number of bytes in input, including the trailing NULs
1480 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
// srcLen starts at 1 so that the terminating unit is counted as well
1481 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1484 srcLen
*= BYTES_PER_CHAR
;
1486 else // we already have the length
1488 // we can only convert an entire number of UTF-16 characters
1489 if ( srcLen
% BYTES_PER_CHAR
)
1490 return wxCONV_FAILED
;
1496 // case when in-memory representation is UTF-16 too
1499 // ----------------------------------------------------------------------------
1500 // conversions without endianness change
1501 // ----------------------------------------------------------------------------
// UTF-16 input in native byte order to UTF-16 wchar_t: the data is copied
// with memcpy(). Returns wxCONV_FAILED if GetLength() fails or if dstLen is
// smaller than the number of input characters.
// NOTE(review): GetLength() reports failure with wxCONV_FAILED while the check
// below compares against wxNO_LEN -- presumably both have the same value;
// confirm. The NULL-dst guard and the final return are missing from this
// excerpt.
1504 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1505 const char *src
, size_t srcLen
) const
1507 // set up the scene for using memcpy() (which is presumably more efficient
1508 // than copying the bytes one by one)
1509 srcLen
= GetLength(src
, srcLen
);
1510 if ( srcLen
== wxNO_LEN
)
1511 return wxCONV_FAILED
;
1513 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1516 if ( dstLen
< inLen
)
1517 return wxCONV_FAILED
;
1519 memcpy(dst
, src
, srcLen
);
// UTF-16 wchar_t to UTF-16 output in native byte order: the data is copied
// with memcpy(). With wxNO_LEN the length is wxWcslen(src) + 1 characters;
// srcLen is then converted to bytes and compared with dstLen, which is
// therefore a size in bytes. Returns wxCONV_FAILED if dstLen is too small.
// NOTE(review): the NULL-dst guard and the final return are missing from this
// excerpt.
1526 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1527 const wchar_t *src
, size_t srcLen
) const
1529 if ( srcLen
== wxNO_LEN
)
1530 srcLen
= wxWcslen(src
) + 1;
1532 srcLen
*= BYTES_PER_CHAR
;
1536 if ( dstLen
< srcLen
)
1537 return wxCONV_FAILED
;
1539 memcpy(dst
, src
, srcLen
);
1545 // ----------------------------------------------------------------------------
1546 // endian-reversing conversions
1547 // ----------------------------------------------------------------------------
// UTF-16 input in the opposite byte order to UTF-16 wchar_t: each 16-bit unit
// is byte-swapped with wxUINT16_SWAP_ALWAYS while being copied. Returns
// wxCONV_FAILED if GetLength() fails or if dstLen is smaller than the number
// of input characters.
// NOTE(review): GetLength() reports failure with wxCONV_FAILED while the check
// below compares against wxNO_LEN -- presumably both have the same value;
// confirm. The NULL-dst guard and the final return are missing from this
// excerpt.
1550 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1551 const char *src
, size_t srcLen
) const
1553 srcLen
= GetLength(src
, srcLen
);
1554 if ( srcLen
== wxNO_LEN
)
1555 return wxCONV_FAILED
;
// from here on srcLen counts 16-bit characters, not bytes
1557 srcLen
/= BYTES_PER_CHAR
;
1561 if ( dstLen
< srcLen
)
1562 return wxCONV_FAILED
;
1564 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1565 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1567 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
// wchar_t -> UTF-16 (opposite byte order), for builds where wchar_t holds
// UTF-16: every unit is byte-swapped while being written to dst.
//
// If srcLen is wxNO_LEN it is computed with wxWcslen() plus one for the
// terminating NUL. Fails with wxCONV_FAILED if the output, in bytes, does
// not fit into dstLen.
// NOTE(review): the condition guarding the loop and the final return value
// are on lines not visible in this extract.
1575 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1576 const wchar_t *src
, size_t srcLen
) const
1578 if ( srcLen
== wxNO_LEN
)
1579 srcLen
= wxWcslen(src
) + 1;
// from here on srcLen is a byte count
1581 srcLen
*= BYTES_PER_CHAR
;
1585 if ( dstLen
< srcLen
)
1586 return wxCONV_FAILED
;
1588 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
// n advances in bytes (srcLen is in bytes here), src in wide characters
1589 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1591 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1598 #else // !WC_UTF16: wchar_t is UTF-32
1600 // ----------------------------------------------------------------------------
1601 // conversions without endianness change
1602 // ----------------------------------------------------------------------------
// UTF-16 (native byte order) -> wchar_t, for builds where wchar_t is
// UTF-32: units are decoded one code point at a time with
// wxDecodeSurrogate().
//
// Fails with wxCONV_FAILED if GetLength() rejects the input, on the decoding
// error path below, or if more than dstLen characters would be produced.
// NOTE(review): the declaration of outLen, the conditions around the
// "optimization" comment and the error test after wxDecodeSurrogate() are on
// lines not visible in this extract.
1605 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1606 const char *src
, size_t srcLen
) const
1608 srcLen
= GetLength(src
, srcLen
);
1609 if ( srcLen
== wxNO_LEN
)
1610 return wxCONV_FAILED
;
// number of 16-bit units in the input
1612 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1615 // optimization: return maximal space which could be needed for this
1616 // string even if the real size could be smaller if the buffer contains
1622 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
// no increment expression: inBuff is passed by address to
// wxDecodeSurrogate(), which presumably advances it
1623 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1625 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1627 return wxCONV_FAILED
;
1629 if ( ++outLen
> dstLen
)
1630 return wxCONV_FAILED
;
// wchar_t -> UTF-16 (native byte order), for builds where wchar_t is
// UTF-32: each character is encoded with encode_utf16(), which yields one
// or two 16-bit units.
//
// If srcLen is wxNO_LEN it is computed with wxWcslen() plus one for the
// terminating NUL. Fails with wxCONV_FAILED if encode_utf16() fails or if
// the accumulated output size in bytes exceeds dstLen.
// NOTE(review): the declarations of outLen and cc and the stores into
// outBuff are on lines not visible in this extract.
1640 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1641 const wchar_t *src
, size_t srcLen
) const
1643 if ( srcLen
== wxNO_LEN
)
1644 srcLen
= wxWcslen(src
) + 1;
1647 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1648 for ( size_t n
= 0; n
< srcLen
; n
++ )
1651 const size_t numChars
= encode_utf16(*src
++, cc
);
1652 if ( numChars
== wxCONV_FAILED
)
1653 return wxCONV_FAILED
;
// outLen is a byte count, numChars a count of 16-bit units
1655 outLen
+= numChars
* BYTES_PER_CHAR
;
1658 if ( outLen
> dstLen
)
1659 return wxCONV_FAILED
;
1662 if ( numChars
== 2 )
1664 // second character of a surrogate
1673 // ----------------------------------------------------------------------------
1674 // endian-reversing conversions
1675 // ----------------------------------------------------------------------------
// UTF-16 (opposite byte order) -> wchar_t, for builds where wchar_t is
// UTF-32: units are byte-swapped into a two-element scratch array and then
// decoded with decode_utf16().
//
// Fails with wxCONV_FAILED if GetLength() rejects the input, if
// decode_utf16() fails, or if more than dstLen characters would be produced.
// NOTE(review): the declarations of tmp, ch and outLen and the statements
// advancing inBuff are on lines not visible in this extract.
1678 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1679 const char *src
, size_t srcLen
) const
1681 srcLen
= GetLength(src
, srcLen
);
1682 if ( srcLen
== wxNO_LEN
)
1683 return wxCONV_FAILED
;
// number of 16-bit units in the input
1685 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1688 // optimization: return maximal space which could be needed for this
1689 // string even if the real size could be smaller if the buffer contains
1695 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1696 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1701 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
// NOTE(review): tmp[1] is filled unconditionally; if inBuff has been
// advanced to inEnd by this point this reads one unit past the input --
// confirm against the lines not shown here
1703 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1705 const size_t numChars
= decode_utf16(tmp
, ch
);
1706 if ( numChars
== wxCONV_FAILED
)
1707 return wxCONV_FAILED
;
1709 if ( numChars
== 2 )
1712 if ( ++outLen
> dstLen
)
1713 return wxCONV_FAILED
;
// wchar_t -> UTF-16 (opposite byte order), for builds where wchar_t is
// UTF-32: each character is encoded with encode_utf16() and the resulting
// one or two units are byte-swapped on output.
//
// If srcLen is wxNO_LEN it is computed with wxWcslen() plus one for the
// terminating NUL. Fails with wxCONV_FAILED if encode_utf16() fails or if
// the accumulated output size in bytes exceeds dstLen.
// NOTE(review): the declarations of outLen and cc and the condition guarding
// the stores are on lines not visible in this extract.
1723 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1724 const wchar_t *src
, size_t srcLen
) const
1726 if ( srcLen
== wxNO_LEN
)
1727 srcLen
= wxWcslen(src
) + 1;
1730 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1731 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1734 const size_t numChars
= encode_utf16(*src
, cc
);
1735 if ( numChars
== wxCONV_FAILED
)
1736 return wxCONV_FAILED
;
// outLen is a byte count, numChars a count of 16-bit units
1738 outLen
+= numChars
* BYTES_PER_CHAR
;
1741 if ( outLen
> dstLen
)
1742 return wxCONV_FAILED
;
1744 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1745 if ( numChars
== 2 )
1747 // second character of a surrogate
1748 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1756 #endif // WC_UTF16/!WC_UTF16
1759 // ============================================================================
1761 // ============================================================================
1763 #ifdef WORDS_BIGENDIAN
1764 #define wxMBConvUTF32straight wxMBConvUTF32BE
1765 #define wxMBConvUTF32swap wxMBConvUTF32LE
1767 #define wxMBConvUTF32swap wxMBConvUTF32BE
1768 #define wxMBConvUTF32straight wxMBConvUTF32LE
1772 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1773 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
// Normalizes the length of a UTF-32 encoded input buffer.
//
// src:    input bytes, reinterpreted below as an array of wxUint32 units
// srcLen: length in bytes, or wxNO_LEN if the input is terminated by a zero
//         32-bit unit
//
// With wxNO_LEN the units are counted up to and including the terminating
// zero unit and the count is scaled by BYTES_PER_CHAR. With an explicit
// length, wxCONV_FAILED is returned if it is not a multiple of
// BYTES_PER_CHAR.
// NOTE(review): the final return statement is not visible in this extract.
1776 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1778 if ( srcLen
== wxNO_LEN
)
1780 // count the number of bytes in input, including the trailing NULs
1781 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
// srcLen starts at 1 so that the terminating zero unit is counted as well
1782 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
// convert the count of 32-bit units into a byte count
1785 srcLen
*= BYTES_PER_CHAR
;
1787 else // we already have the length
1789 // we can only convert an entire number of UTF-32 characters
1790 if ( srcLen
% BYTES_PER_CHAR
)
1791 return wxCONV_FAILED
;
1797 // case when in-memory representation is UTF-16
1800 // ----------------------------------------------------------------------------
1801 // conversions without endianness change
1802 // ----------------------------------------------------------------------------
// UTF-32 (native byte order) -> wchar_t, for builds where wchar_t holds
// UTF-16: each 32-bit code point is encoded with encode_utf16() into one or
// two 16-bit units.
//
// Fails with wxCONV_FAILED if GetLength() rejects the input, if
// encode_utf16() fails, or if outLen exceeds dstLen.
// NOTE(review): the declarations of outLen and cc, the update of outLen and
// the stores into dst are on lines not visible in this extract.
1805 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1806 const char *src
, size_t srcLen
) const
1808 srcLen
= GetLength(src
, srcLen
);
1809 if ( srcLen
== wxNO_LEN
)
1810 return wxCONV_FAILED
;
1812 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
// number of 32-bit units in the input
1813 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1815 for ( size_t n
= 0; n
< inLen
; n
++ )
1818 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1819 if ( numChars
== wxCONV_FAILED
)
1820 return wxCONV_FAILED
;
1825 if ( outLen
> dstLen
)
1826 return wxCONV_FAILED
;
1829 if ( numChars
== 2 )
1831 // second character of a surrogate
// wchar_t -> UTF-32 (native byte order), for builds where wchar_t holds
// UTF-16: code points are decoded from the wide string with
// wxDecodeSurrogate() and each one accounts for BYTES_PER_CHAR output bytes.
//
// If srcLen is wxNO_LEN it is computed with wxWcslen() plus one for the
// terminating NUL. One path returns srcLen * BYTES_PER_CHAR as an upper
// bound without running the loop (see the "optimization" comment).
// Fails with wxCONV_FAILED on the decoding error path or if outLen exceeds
// dstLen.
// NOTE(review): the condition selecting the upper-bound path, the
// declaration of outLen, the error test after wxDecodeSurrogate() and the
// stores into outBuff are on lines not visible in this extract.
1841 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1842 const wchar_t *src
, size_t srcLen
) const
1844 if ( srcLen
== wxNO_LEN
)
1845 srcLen
= wxWcslen(src
) + 1;
1849 // optimization: return maximal space which could be needed for this
1850 // string instead of the exact amount which could be less if there are
1851 // any surrogates in the input
1853 // we consider that surrogates are rare enough to make it worthwhile to
1854 // avoid running the loop below at the cost of slightly extra memory
1856 return srcLen
* BYTES_PER_CHAR
;
1859 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
// no increment expression: src is passed by address to
// wxDecodeSurrogate(), which presumably advances it
1861 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1863 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1865 return wxCONV_FAILED
;
1867 outLen
+= BYTES_PER_CHAR
;
1869 if ( outLen
> dstLen
)
1870 return wxCONV_FAILED
;
1878 // ----------------------------------------------------------------------------
1879 // endian-reversing conversions
1880 // ----------------------------------------------------------------------------
// UTF-32 (opposite byte order) -> wchar_t, for builds where wchar_t holds
// UTF-16: each 32-bit unit is byte-swapped and then encoded with
// encode_utf16() into one or two 16-bit units.
//
// Fails with wxCONV_FAILED if GetLength() rejects the input, if
// encode_utf16() fails, or if outLen exceeds dstLen.
// NOTE(review): the declarations of outLen and cc, the update of outLen and
// the stores into dst are on lines not visible in this extract.
1883 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1884 const char *src
, size_t srcLen
) const
1886 srcLen
= GetLength(src
, srcLen
);
1887 if ( srcLen
== wxNO_LEN
)
1888 return wxCONV_FAILED
;
1890 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
// number of 32-bit units in the input
1891 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1893 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1896 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1897 if ( numChars
== wxCONV_FAILED
)
1898 return wxCONV_FAILED
;
1903 if ( outLen
> dstLen
)
1904 return wxCONV_FAILED
;
1907 if ( numChars
== 2 )
1909 // second character of a surrogate
// wchar_t -> UTF-32 (opposite byte order), for builds where wchar_t holds
// UTF-16: code points are decoded with wxDecodeSurrogate() and written
// byte-swapped to the output.
//
// If srcLen is wxNO_LEN it is computed with wxWcslen() plus one for the
// terminating NUL. One path returns srcLen * BYTES_PER_CHAR as an upper
// bound without running the loop (see the "optimization" comment).
// Fails with wxCONV_FAILED on the decoding error path or if outLen exceeds
// dstLen.
// NOTE(review): the condition selecting the upper-bound path, the
// declaration of outLen and the error test after wxDecodeSurrogate() are on
// lines not visible in this extract.
1919 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1920 const wchar_t *src
, size_t srcLen
) const
1922 if ( srcLen
== wxNO_LEN
)
1923 srcLen
= wxWcslen(src
) + 1;
1927 // optimization: return maximal space which could be needed for this
1928 // string instead of the exact amount which could be less if there are
1929 // any surrogates in the input
1931 // we consider that surrogates are rare enough to make it worthwhile to
1932 // avoid running the loop below at the cost of slightly extra memory
1934 return srcLen
*BYTES_PER_CHAR
;
1937 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
// no increment expression: src is passed by address to
// wxDecodeSurrogate(), which presumably advances it
1939 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1941 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1943 return wxCONV_FAILED
;
1945 outLen
+= BYTES_PER_CHAR
;
1947 if ( outLen
> dstLen
)
1948 return wxCONV_FAILED
;
1950 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1956 #else // !WC_UTF16: wchar_t is UTF-32
1958 // ----------------------------------------------------------------------------
1959 // conversions without endianness change
1960 // ----------------------------------------------------------------------------
// UTF-32 (native byte order) -> wchar_t, for builds where wchar_t is itself
// UTF-32: the input can be copied verbatim.
//
// Fails with wxCONV_FAILED if GetLength() rejects the input or if dstLen
// (in wide characters) is smaller than the number of input units.
// NOTE(review): the condition guarding the copy and the final return value
// are on lines not visible in this extract.
1963 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1964 const char *src
, size_t srcLen
) const
1966 // use memcpy() as it should be much faster than hand-written loop
1967 srcLen
= GetLength(src
, srcLen
);
1968 if ( srcLen
== wxNO_LEN
)
1969 return wxCONV_FAILED
;
// srcLen is in bytes, inLen in 32-bit units
1971 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1974 if ( dstLen
< inLen
)
1975 return wxCONV_FAILED
;
1977 memcpy(dst
, src
, srcLen
);
// wchar_t -> UTF-32 (native byte order), for builds where wchar_t is itself
// UTF-32: the input can be copied verbatim.
//
// If srcLen is wxNO_LEN it is computed with wxWcslen() plus one for the
// terminating NUL. srcLen is then converted to bytes and compared against
// dstLen (in bytes); wxCONV_FAILED is returned if it does not fit.
// NOTE(review): the condition guarding the copy and the final return value
// are on lines not visible in this extract.
1984 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1985 const wchar_t *src
, size_t srcLen
) const
1987 if ( srcLen
== wxNO_LEN
)
1988 srcLen
= wxWcslen(src
) + 1;
// from here on srcLen is a byte count
1990 srcLen
*= BYTES_PER_CHAR
;
1994 if ( dstLen
< srcLen
)
1995 return wxCONV_FAILED
;
1997 memcpy(dst
, src
, srcLen
);
2003 // ----------------------------------------------------------------------------
2004 // endian-reversing conversions
2005 // ----------------------------------------------------------------------------
// UTF-32 (opposite byte order) -> wchar_t, for builds where wchar_t is
// UTF-32: every 32-bit unit is byte-swapped while being copied.
//
// Fails with wxCONV_FAILED if GetLength() rejects the input or if dstLen is
// smaller than the number of input units.
// NOTE(review): the condition guarding the loop and the final return value
// are on lines not visible in this extract.
2008 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
2009 const char *src
, size_t srcLen
) const
2011 srcLen
= GetLength(src
, srcLen
);
2012 if ( srcLen
== wxNO_LEN
)
2013 return wxCONV_FAILED
;
// from here on srcLen counts 32-bit units, not bytes
2015 srcLen
/= BYTES_PER_CHAR
;
2019 if ( dstLen
< srcLen
)
2020 return wxCONV_FAILED
;
2022 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
2023 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
2025 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
// wchar_t -> UTF-32 (opposite byte order), for builds where wchar_t is
// UTF-32: every unit is byte-swapped while being written to dst.
//
// If srcLen is wxNO_LEN it is computed with wxWcslen() plus one for the
// terminating NUL. Fails with wxCONV_FAILED if the output, in bytes, does
// not fit into dstLen.
// NOTE(review): the condition guarding the loop and the final return value
// are on lines not visible in this extract.
2033 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
2034 const wchar_t *src
, size_t srcLen
) const
2036 if ( srcLen
== wxNO_LEN
)
2037 srcLen
= wxWcslen(src
) + 1;
// from here on srcLen is a byte count
2039 srcLen
*= BYTES_PER_CHAR
;
2043 if ( dstLen
< srcLen
)
2044 return wxCONV_FAILED
;
2046 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
// n advances in bytes (srcLen is in bytes here), src in wide characters
2047 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
2049 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
2056 #endif // WC_UTF16/!WC_UTF16
2059 // ============================================================================
2060 // The classes doing conversion using the iconv_xxx() functions
2061 // ============================================================================
2065 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2066 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2067 // (unless there's yet another bug in glibc) the only case when iconv()
2068 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2069 // left in the input buffer -- when _real_ error occurs,
2070 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2072 // [This bug does not appear in glibc 2.2.]
// Helper macros for the iconv-based converter.
//
// ICONV_FAILED(cres, bufLeft): true if an iconv() call whose result is cres
// should be treated as failed. For glibc 2.0/2.1 a (size_t)-1 result with
// errno == E2BIG and no input left (bufLeft == 0) is not counted as failure.
2073 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2074 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2075 (errno != E2BIG || bufLeft != 0))
// all other C libraries: any (size_t)-1 result is a failure
2077 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
// casts the input buffer pointer to the type used for iconv()'s second
// argument (ICONV_CONST is defined elsewhere)
2080 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
// value used in this file to mark an iconv_t handle as not opened
2082 #define ICONV_T_INVALID ((iconv_t)-1)
// WC_BSWAP: byte-swapping macro matching the size of wchar_t;
// WC_ENC: the wxFontEncoding corresponding to wchar_t of that size
2084 #if SIZEOF_WCHAR_T == 4
2085 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2086 #define WC_ENC wxFONTENCODING_UTF32
2087 #elif SIZEOF_WCHAR_T == 2
2088 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2089 #define WC_ENC wxFONTENCODING_UTF16
2090 #else // sizeof(wchar_t) != 2 nor 4
2091 // does this ever happen?
2092 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2095 // ----------------------------------------------------------------------------
2096 // wxMBConv_iconv: encapsulates an iconv character set
2097 // ----------------------------------------------------------------------------
// wxMBConv implementation converting between a named multibyte charset and
// wchar_t using two iconv handles (m2w and w2m), guarded by a mutex.
// NOTE(review): several member declarations (m2w, w2m, m_name, IsOk()) are
// on lines not visible in this extract.
2099 class wxMBConv_iconv
: public wxMBConv
// name: the charset name, passed to iconv_open() by the constructor
2102 wxMBConv_iconv(const char *name
);
2103 virtual ~wxMBConv_iconv();
2105 // implement base class virtual methods
2106 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
2107 const char *src
, size_t srcLen
= wxNO_LEN
) const;
2108 virtual size_t FromWChar(char *dst
, size_t dstLen
,
2109 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
2110 virtual size_t GetMBNulLen() const;
2112 #if wxUSE_UNICODE_UTF8
2113 virtual bool IsUTF8() const;
// creates a new converter for the same charset name and copies the cached
// NUL length into it
2116 virtual wxMBConv
*Clone() const
2118 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
2119 p
->m_minMBCharWidth
= m_minMBCharWidth
;
// usable only if both conversion handles were opened successfully
2124 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
2127 // the iconv handlers used to translate from multibyte
2128 // to wide char and in the other direction
2133 // guards access to m2w and w2m objects
2134 wxMutex m_iconvMutex
;
2138 // the name (for iconv_open()) of a wide char charset -- if none is
2139 // available on this machine, it will remain empty
2140 static wxString ms_wcCharsetName
;
2142 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2143 // different endian-ness than the native one
2144 static bool ms_wcNeedsSwap
;
2147 // name of the encoding handled by this conversion
2150 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2152 size_t m_minMBCharWidth
;
2155 // make the constructor available for unit testing
// Factory wrapper around the wxMBConv_iconv constructor, exported so that
// unit tests can create the converter. The new object is checked with
// IsOk().
// NOTE(review): what happens when IsOk() fails, and the return statements,
// are on lines not visible in this extract.
2156 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2158 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2159 if ( !result
->IsOk() )
// Definitions of the static members of wxMBConv_iconv: the wide char
// charset name starts out empty and is filled in by the constructor the
// first time a working charset is found; the swap flag starts out false.
2168 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2169 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
// Opens the two iconv handles used by this converter: m2w (from the charset
// called name to the wide char charset) and w2m (the reverse direction).
//
// The first time it runs (ms_wcCharsetName still empty) it also determines
// which iconv charset name represents wchar_t: for each candidate name it
// first tries the variant with an explicit byte order suffix ("LE"/"BE")
// and, failing that, the plain name, in which case a test conversion decides
// whether the result has to be byte-swapped (ms_wcNeedsSwap).
// If no wide char charset could be found, w2m is set to ICONV_T_INVALID.
// NOTE(review): several declarations and the test iconv() call itself are on
// lines not visible in this extract.
2171 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
// 0 means that GetMBNulLen() has not computed the NUL length yet
2174 m_minMBCharWidth
= 0;
2176 // check for charset that represents wchar_t:
2177 if ( ms_wcCharsetName
.empty() )
2179 wxLogTrace(TRACE_STRCONV
, wxT("Looking for wide char codeset:"));
2182 const wxChar
*const *names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2183 #else // !wxUSE_FONTMAP
2184 static const wxChar
*const names_static
[] =
2186 #if SIZEOF_WCHAR_T == 4
// this must be a comparison ("=="): "SIZEOF_WCHAR_T = 2" is not a valid
// preprocessor expression and breaks the build whenever it is evaluated
2188 #elif SIZEOF_WCHAR_T == 2
2193 const wxChar
*const *names
= names_static
;
2194 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
// stop at the first candidate for which a wide char charset name was stored
2196 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2198 const wxString
nameCS(*names
);
2200 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2201 wxString
nameXE(nameCS
);
2203 #ifdef WORDS_BIGENDIAN
2204 nameXE
+= wxT("BE");
2205 #else // little endian
2206 nameXE
+= wxT("LE");
2209 wxLogTrace(TRACE_STRCONV
, wxT(" trying charset \"%s\""),
2212 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2213 if ( m2w
== ICONV_T_INVALID
)
2215 // try charset w/o bytesex info (e.g. "UCS4")
2216 wxLogTrace(TRACE_STRCONV
, wxT(" trying charset \"%s\""),
2218 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2220 // and check for bytesex ourselves:
2221 if ( m2w
!= ICONV_T_INVALID
)
2223 char buf
[2], *bufPtr
;
2232 outsz
= SIZEOF_WCHAR_T
* 2;
2233 char* wbufPtr
= (char*)wbuf
;
2237 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2240 if (ICONV_FAILED(res
, insz
))
2242 wxLogLastError(wxT("iconv"));
2243 wxLogError(_("Conversion to charset '%s' doesn't work."),
2246 else // ok, can convert to this encoding, remember it
2248 ms_wcCharsetName
= nameCS
;
// if the converted test character does not compare equal to the input
// one, the charset uses the opposite byte order
2249 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2253 else // use charset not requiring byte swapping
2255 ms_wcCharsetName
= nameXE
;
2259 wxLogTrace(TRACE_STRCONV
,
2260 wxT("iconv wchar_t charset is \"%s\"%s"),
2261 ms_wcCharsetName
.empty() ? wxString("<none>")
2263 ms_wcNeedsSwap
? wxT(" (needs swap)")
2266 else // we already have ms_wcCharsetName
2268 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2271 if ( ms_wcCharsetName
.empty() )
2273 w2m
= ICONV_T_INVALID
;
2277 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2278 if ( w2m
== ICONV_T_INVALID
)
2280 wxLogTrace(TRACE_STRCONV
,
2281 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2282 ms_wcCharsetName
.c_str(), name
);
// Destructor: each of the two iconv handles is dealt with only if it was
// successfully opened (i.e. differs from ICONV_T_INVALID).
// NOTE(review): the statements executed for valid handles are on lines not
// visible in this extract (presumably iconv_close()).
2287 wxMBConv_iconv::~wxMBConv_iconv()
2289 if ( m2w
!= ICONV_T_INVALID
)
2291 if ( w2m
!= ICONV_T_INVALID
)
// Converts the multibyte string src to wide characters using the m2w iconv
// handle.
//
// dst/dstLen: output buffer and its size in wide characters; when there is
//             no destination buffer the input is converted chunk by chunk
//             into a temporary buffer just to compute the required size
// src/srcLen: input and its length in bytes; with wxNO_LEN the length is
//             determined here, using GetMBNulLen() to know how many NUL
//             bytes terminate a string in this encoding
//
// Returns wxCONV_FAILED if the NUL length check or the iconv() conversion
// fails.
// NOTE(review): several declarations (res, cres, tbuf, p), the iconv() call
// lines and the final return are on lines not visible in this extract.
2296 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2297 const char *src
, size_t srcLen
) const
2299 if ( srcLen
== wxNO_LEN
)
2301 // find the string length: notice that must be done differently for
2302 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2304 const size_t nulLen
= GetMBNulLen();
2308 return wxCONV_FAILED
;
2311 srcLen
= strlen(src
); // arguably more optimized than our version
2316 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2317 // but they also have to start at character boundary and not
2318 // span two adjacent characters
2320 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2326 // when we're determining the length of the string ourselves we count
2327 // the terminating NUL(s) as part of it and always NUL-terminate the
2332 // we express length in the number of (wide) characters but iconv always
2333 // counts buffer sizes in bytes
2334 dstLen
*= SIZEOF_WCHAR_T
;
2337 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2338 // Unfortunately there are a couple of global wxCSConv objects such as
2339 // wxConvLocal that are used all over wx code, so we have to make sure
2340 // the handle is used by at most one thread at the time. Otherwise
2341 // only a few wx classes would be safe to use from non-main threads
2342 // as MB<->WC conversion would fail "randomly".
2343 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2344 #endif // wxUSE_THREADS
2347 const char *pszPtr
= src
;
2351 char* bufPtr
= (char*)dst
;
2353 // have destination buffer, convert there
2354 size_t dstLenOrig
= dstLen
;
2356 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2359 // convert the number of bytes converted as returned by iconv to the
2360 // number of (wide) characters converted that we need
2361 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2365 // convert to native endianness
// the index is a size_t, like the bound it is compared with: an unsigned
// index could wrap around before reaching a bound above UINT_MAX
2366 for ( size_t i
= 0; i
< res
; i
++ )
2367 dst
[i
] = WC_BSWAP(dst
[i
]);
2370 else // no destination buffer
2372 // convert using temp buffer to calculate the size of the buffer needed
2378 char* bufPtr
= (char*)tbuf
;
// the temporary buffer is used in chunks of 8 wide characters
2379 dstLen
= 8 * SIZEOF_WCHAR_T
;
2382 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
// add the number of wide characters produced in this chunk
2385 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
// E2BIG only means that the temporary buffer was filled: keep going
2387 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2390 if (ICONV_FAILED(cres
, srcLen
))
2392 //VS: it is ok if iconv fails, hence trace only
2393 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2394 return wxCONV_FAILED
;
2400 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2401 const wchar_t *src
, size_t srcLen
) const
2404 // NB: explained in MB2WC
2405 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2408 if ( srcLen
== wxNO_LEN
)
2409 srcLen
= wxWcslen(src
) + 1;
2411 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2412 size_t outbuflen
= dstLen
;
2415 wchar_t *tmpbuf
= 0;
2419 // need to copy to temp buffer to switch endianness
2420 // (doing WC_BSWAP twice on the original buffer won't work, as it
2421 // could be in read-only memory, or be accessed in some other thread)
2422 tmpbuf
= (wchar_t *)malloc(inbuflen
);
2423 for ( size_t i
= 0; i
< srcLen
; i
++ )
2424 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2429 char* inbuf
= (char*)src
;
2432 // have destination buffer, convert there
2433 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2435 res
= dstLen
- outbuflen
;
2437 else // no destination buffer
2439 // convert using temp buffer to calculate the size of the buffer needed
2445 outbuflen
= WXSIZEOF(tbuf
);
2447 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2449 res
+= WXSIZEOF(tbuf
) - outbuflen
;
2451 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2459 if (ICONV_FAILED(cres
, inbuflen
))
2461 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2462 return wxCONV_FAILED
;
// Returns the number of bytes used to encode the NUL character in this
// encoding, computing it on first use and caching it in m_minMBCharWidth
// (0 means "not computed yet").
//
// The length is found by converting a single wide NUL with the w2m handle
// and measuring how far the output pointer advanced; if iconv() fails the
// cached value is set to (size_t)-1.
2468 size_t wxMBConv_iconv::GetMBNulLen() const
2470 if ( m_minMBCharWidth
== 0 )
// this method is const, so the cache is updated through a non-const alias
2472 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2475 // NB: the need for this lock is explained in ToWChar()
2476 wxMutexLocker
lock(self
->m_iconvMutex
);
2479 const wchar_t *wnul
= L
"";
2480 char buf
[8]; // should be enough for NUL in any encoding
2481 size_t inLen
= sizeof(wchar_t),
2482 outLen
= WXSIZEOF(buf
);
2483 char *inBuff
= (char *)wnul
;
2484 char *outBuff
= buf
;
2485 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2487 self
->m_minMBCharWidth
= (size_t)-1;
// iconv() advanced outBuff past the bytes it wrote
2491 self
->m_minMBCharWidth
= outBuff
- buf
;
2495 return m_minMBCharWidth
;
2498 #if wxUSE_UNICODE_UTF8
// Returns true if the name of the encoding handled by this converter is
// "UTF-8" or "UTF8", compared with wxStricmp().
2499 bool wxMBConv_iconv::IsUTF8() const
2501 return wxStricmp(m_name
, "UTF-8") == 0 ||
2502 wxStricmp(m_name
, "UTF8") == 0;
2506 #endif // HAVE_ICONV
2509 // ============================================================================
2510 // Win32 conversion classes
2511 // ============================================================================
2513 #ifdef wxHAVE_WIN32_MB2WC
2517 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2518 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
// wxMBConv implementation for Win32 based on MultiByteToWideChar() and
// WideCharToMultiByte() for the code page stored in m_CodePage.
//
// All constructors set the code page and, except for the copy constructor,
// reset the cached NUL length (m_minMBCharWidth) to 0.
// NOTE(review): the default constructor's signature line and the
// preprocessor line opening the wxUSE_FONTMAP section are not visible in
// this extract.
2521 class wxMBConv_win32
: public wxMBConv
// default: use the CP_ACP code page
2526 m_CodePage
= CP_ACP
;
2527 m_minMBCharWidth
= 0;
// copy constructor: copies the code page and the cached NUL length
2530 wxMBConv_win32(const wxMBConv_win32
& conv
)
2533 m_CodePage
= conv
.m_CodePage
;
2534 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
// from a charset name, mapped to a code page by wxCharsetToCodepage()
2538 wxMBConv_win32(const char* name
)
2540 m_CodePage
= wxCharsetToCodepage(name
);
2541 m_minMBCharWidth
= 0;
// from a wxFontEncoding, mapped to a code page by wxEncodingToCodepage()
2544 wxMBConv_win32(wxFontEncoding encoding
)
2546 m_CodePage
= wxEncodingToCodepage(encoding
);
2547 m_minMBCharWidth
= 0;
2549 #endif // wxUSE_FONTMAP
// Converts the NUL-terminated multibyte string psz to wide characters.
//
// buf: output buffer; if it is NULL, 0 is passed as the output size to
//      MultiByteToWideChar()
// n:   size of buf in wide characters
//
// UTF-8 and UTF-7 are delegated to wxMBConvUTF8/wxMBConvUTF7. For other code
// pages MultiByteToWideChar() is used, with MB_ERR_INVALID_CHARS when the
// code page and the system allow it; otherwise, when really converting, the
// result is verified by converting it back and comparing with the input.
// Returns wxCONV_FAILED on failure.
// NOTE(review): the declaration of flags, the failure test after
// MultiByteToWideChar() and the final return are on lines not visible in
// this extract.
2551 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2553 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2554 // the behaviour is not compatible with the Unix version (using iconv)
2555 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2556 // wouldn't work if reading an incomplete MB char didn't result in an
2559 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2560 // Win XP or newer and it is not supported for UTF-[78] so we always
2561 // use our own conversions in this case. See
2562 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2563 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2564 if ( m_CodePage
== CP_UTF8
)
2566 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2569 if ( m_CodePage
== CP_UTF7
)
2571 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2575 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2576 IsAtLeastWin2kSP4() )
2578 flags
= MB_ERR_INVALID_CHARS
;
2581 const size_t len
= ::MultiByteToWideChar
2583 m_CodePage
, // code page
2584 flags
, // flags: fail on error
2585 psz
, // input string
2586 -1, // its length (NUL-terminated)
2587 buf
, // output string
2588 buf
? n
: 0 // size of output buffer
2592 // function totally failed
2593 return wxCONV_FAILED
;
2596 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2597 // check if we succeeded, by doing a double trip:
2598 if ( !flags
&& buf
)
2600 const size_t mbLen
= strlen(psz
);
2601 wxCharBuffer
mbBuf(mbLen
);
2602 if ( ::WideCharToMultiByte
2609 mbLen
+ 1, // size in bytes, not length
2613 strcmp(mbBuf
, psz
) != 0 )
2615 // we didn't obtain the same thing we started from, hence
2616 // the conversion was lossy and we consider that it failed
2617 return wxCONV_FAILED
;
2621 // note that it returns count of written chars for buf != NULL and size
2622 // of the needed buffer for buf == NULL so in either case the length of
2623 // the string (which never includes the terminating NUL) is one less
// Converts the NUL-terminated wide string pwz to this converter's code page.
//
// buf: output buffer; if it is NULL, 0 is passed as the output size to
//      WideCharToMultiByte()
// n:   size of buf in bytes
//
// When WC_NO_BEST_FIT_CHARS can be relied upon (CanUseNoBestFit() and a code
// page below 50000) the "default char used" output of WideCharToMultiByte()
// detects lossy conversions; otherwise the result is converted back with
// MB2WC() and compared with the input. Returns wxCONV_FAILED on failure.
// NOTE(review): the free-text lines following the signature belong to a
// block comment whose delimiters are not visible in this extract, as are the
// declarations of flags and pUsedDef and the final return.
2627 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2630 we have a problem here: by default, WideCharToMultiByte() may
2631 replace characters unrepresentable in the target code page with bad
2632 quality approximations such as turning "1/2" symbol (U+00BD) into
2633 "1" for the code pages which don't have it and we, obviously, want
2634 to avoid this at any price
2636 the trouble is that this function does it _silently_, i.e. it won't
2637 even tell us whether it did or not... Win98/2000 and higher provide
2638 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2639 we have to resort to a round trip, i.e. check that converting back
2640 results in the same string -- this is, of course, expensive but
2641 otherwise we simply can't be sure to not garble the data.
2644 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2645 // it doesn't work with CJK encodings (which we test for rather roughly
2646 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2648 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2651 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2653 // it's our lucky day
2654 flags
= WC_NO_BEST_FIT_CHARS
;
2655 pUsedDef
= &usedDef
;
2657 else // old system or unsupported encoding
2663 const size_t len
= ::WideCharToMultiByte
2665 m_CodePage
, // code page
2666 flags
, // either none or no best fit
2667 pwz
, // input string
2668 -1, // it is (wide) NUL-terminated
2669 buf
, // output buffer
2670 buf
? n
: 0, // and its size
2671 NULL
, // default "replacement" char
2672 pUsedDef
// [out] was it used?
2677 // function totally failed
2678 return wxCONV_FAILED
;
2681 // we did something, check if we really succeeded
2684 // check if the conversion failed, i.e. if any replacements
2687 return wxCONV_FAILED
;
2689 else // we must resort to double tripping...
2691 // first we need to ensure that we really have the MB data: this is
2692 // not the case if we're called with NULL buffer, in which case we
2693 // need to do the conversion yet again
2694 wxCharBuffer bufDef
;
2697 bufDef
= wxCharBuffer(len
);
2698 buf
= bufDef
.data();
2699 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2700 buf
, len
, NULL
, NULL
) )
2701 return wxCONV_FAILED
;
// convert the multibyte result back and compare it with the input
2706 wxWCharBuffer
wcBuf(n
);
2707 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2708 wcscmp(wcBuf
, pwz
) != 0 )
2710 // we didn't obtain the same thing we started from, hence
2711 // the conversion was lossy and we consider that it failed
2712 return wxCONV_FAILED
;
2716 // see the comment above for the reason of "len - 1"
// Returns the number of bytes used to encode NUL in this code page,
// computing it on first use and caching it in m_minMBCharWidth (0 means
// "not computed yet").
//
// The length is obtained by asking WideCharToMultiByte() to translate a
// single wide NUL. Two paths store (size_t)-1 in the cache, one of them
// after logging "Unexpected NUL length".
// NOTE(review): the switch/conditions selecting between the three
// assignments below are on lines not visible in this extract.
2720 virtual size_t GetMBNulLen() const
2722 if ( m_minMBCharWidth
== 0 )
2724 int len
= ::WideCharToMultiByte
2726 m_CodePage
, // code page
2728 L
"", // input string
2729 1, // translate just the NUL
2730 NULL
, // output buffer
2732 NULL
, // no replacement char
2733 NULL
// [out] don't care if it was used
// this method is const, so the cache is updated through a non-const alias
2736 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2740 wxLogDebug(wxT("Unexpected NUL length %d"), len
);
2741 self
->m_minMBCharWidth
= (size_t)-1;
2745 self
->m_minMBCharWidth
= (size_t)-1;
2751 self
->m_minMBCharWidth
= len
;
2756 return m_minMBCharWidth
;
2759 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2761 bool IsOk() const { return m_CodePage
!= -1; }
// Tell whether the running system is one on which WC_NO_BEST_FIT_CHARS may be
// used (WC2MB() checks this before passing that flag).
//
// The answer is computed once and kept in a function-local static: -1 means
// "not determined yet", and the function returns true only when the stored
// value is 1.
2764 static bool CanUseNoBestFit()
2766 static int s_isWin98Or2k
= -1;
2768 if ( s_isWin98Or2k
== -1 )
// the OS family returned by wxGetOsVersion() selects which version test
// is applied to the major/minor numbers it fills in
2771 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2773 case wxOS_WINDOWS_9X
:
// 9x family: require major >= 4 and minor >= 10
2774 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2777 case wxOS_WINDOWS_NT
:
// NT family: require major >= 5
2778 s_isWin98Or2k
= verMaj
>= 5;
2782 // unknown: be conservative by default
// NOTE(review): the statement belonging to the "unknown" case is not
// visible in this extract
2787 wxASSERT_MSG( s_isWin98Or2k
!= -1, wxT("should be set above") );
2790 return s_isWin98Or2k
== 1;
// Tell whether the running Windows version is at least Windows 2000 SP4.
//
// The answer is computed once and kept in a function-local static: -1 means
// "not determined yet", and the function returns true only when the stored
// value is 1.
2793 static bool IsAtLeastWin2kSP4()
2798 static int s_isAtLeastWin2kSP4
= -1;
2800 if ( s_isAtLeastWin2kSP4
== -1 )
2802 OSVERSIONINFOEX ver
;
// zero the structure and fill in its size field before querying the
// version
2804 memset(&ver
, 0, sizeof(ver
));
2805 ver
.dwOSVersionInfoSize
= sizeof(ver
);
// NOTE(review): the result of GetVersionEx() is not checked; as ver was
// zeroed above, none of the comparisons below can hold if the call
// leaves it untouched
2806 GetVersionEx((OSVERSIONINFO
*)&ver
);
// accepted: major version above 5, version 5.x with x > 0, or version
// 5.0 with service pack 4 or later
2808 s_isAtLeastWin2kSP4
=
2809 ((ver
.dwMajorVersion
> 5) || // Vista+
2810 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2811 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2812 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2816 return s_isAtLeastWin2kSP4
== 1;
2821 // the code page we're working with
2824 // cached result of GetMBNulLen(), set to 0 initially meaning
2826 size_t m_minMBCharWidth
;
2829 #endif // wxHAVE_WIN32_MB2WC
2832 // ============================================================================
2833 // wxEncodingConverter based conversion classes
2834 // ============================================================================
2838 class wxMBConv_wxwin
: public wxMBConv
2843 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2844 // The wxMBConv_cf class does a better job.
2845 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2846 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2847 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2851 // temporarily just use wxEncodingConverter stuff,
2852 // so that it works while a better implementation is built
2853 wxMBConv_wxwin(const char* name
)
2856 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2858 m_enc
= wxFONTENCODING_SYSTEM
;
2863 wxMBConv_wxwin(wxFontEncoding enc
)
// Convert the NUL-terminated multibyte string psz to wide characters in buf
// using the m2w encoding converter; wxCONV_FAILED is returned if the
// converter reports failure.
//
// NOTE(review): the size parameter is marked WXUNUSED, so the capacity of buf
// is not taken into account by the visible code. The use made of inbuf (the
// strlen() of the input) is not visible in this extract.
2870 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2872 size_t inbuf
= strlen(psz
);
2875 if (!m2w
.Convert(psz
, buf
))
2876 return wxCONV_FAILED
;
// Convert the NUL-terminated wide string psz to multibyte characters in buf
// using the w2m encoding converter; wxCONV_FAILED is returned if the
// converter reports failure.
//
// NOTE(review): the size parameter is marked WXUNUSED, so the capacity of buf
// is not taken into account by the visible code. The use made of inbuf (the
// wxWcslen() of the input) is not visible in this extract.
2881 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2883 const size_t inbuf
= wxWcslen(psz
);
2886 if (!w2m
.Convert(psz
, buf
))
2887 return wxCONV_FAILED
;
2893 virtual size_t GetMBNulLen() const
2897 case wxFONTENCODING_UTF16BE
:
2898 case wxFONTENCODING_UTF16LE
:
2901 case wxFONTENCODING_UTF32BE
:
2902 case wxFONTENCODING_UTF32LE
:
2910 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2912 bool IsOk() const { return m_ok
; }
2915 wxFontEncoding m_enc
;
2916 wxEncodingConverter m2w
, w2m
;
2919 // were we initialized successfully?
2922 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin
);
2925 // make the constructors available for unit testing
2926 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2928 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2929 if ( !result
->IsOk() )
2938 #endif // wxUSE_FONTMAP
2940 // ============================================================================
2941 // wxCSConv implementation
2942 // ============================================================================
2944 void wxCSConv::Init()
2951 wxCSConv::wxCSConv(const wxString
& charset
)
2955 if ( !charset
.empty() )
2957 SetName(charset
.ToAscii());
2961 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2962 if ( m_encoding
== wxFONTENCODING_MAX
)
2964 // set to unknown/invalid value
2965 m_encoding
= wxFONTENCODING_SYSTEM
;
2967 else if ( m_encoding
== wxFONTENCODING_DEFAULT
)
2969 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2970 m_encoding
= wxFONTENCODING_ISO8859_1
;
2973 m_encoding
= wxFONTENCODING_SYSTEM
;
// Construct a converter for the given font encoding.
//
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not valid here: they
// trigger a wxFAIL_MSG and are replaced by wxFONTENCODING_SYSTEM before the
// value is stored in m_encoding.
//
// NOTE(review): any statements between the check and the final assignment are
// not visible in this extract.
2977 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2979 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2981 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2983 encoding
= wxFONTENCODING_SYSTEM
;
2988 m_encoding
= encoding
;
2991 wxCSConv::~wxCSConv()
2996 wxCSConv::wxCSConv(const wxCSConv
& conv
)
3001 SetName(conv
.m_name
);
3002 m_encoding
= conv
.m_encoding
;
3005 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
3009 SetName(conv
.m_name
);
3010 m_encoding
= conv
.m_encoding
;
3015 void wxCSConv::Clear()
3024 void wxCSConv::SetName(const char *charset
)
3028 m_name
= wxStrdup(charset
);
3035 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
3036 wxEncodingNameCache
);
3038 static wxEncodingNameCache gs_nameCache
;
3041 wxMBConv
*wxCSConv::DoCreate() const
3044 wxLogTrace(TRACE_STRCONV
,
3045 wxT("creating conversion for %s"),
3047 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
3048 #endif // wxUSE_FONTMAP
3050 // check for the special case of ASCII or ISO8859-1 charset: as we have
3051 // special knowledge of it anyhow, we don't need to create a special
3052 // conversion object
3053 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
3054 m_encoding
== wxFONTENCODING_DEFAULT
)
3056 // don't convert at all
3060 // we trust OS to do conversion better than we can so try external
3061 // conversion methods first
3063 // the full order is:
3064 // 1. OS conversion (iconv() under Unix or Win32 API)
3065 // 2. hard coded conversions for UTF
3066 // 3. wxEncodingConverter as fall back
3072 #endif // !wxUSE_FONTMAP
3075 wxFontEncoding
encoding(m_encoding
);
3080 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
3088 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3089 #endif // wxUSE_FONTMAP
3093 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3094 if ( it
!= gs_nameCache
.end() )
3096 if ( it
->second
.empty() )
3099 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
3106 const wxChar
* const* names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3107 // CS : in case this does not return valid names (eg for MacRoman)
3108 // encoding got a 'failure' entry in the cache all the same,
3109 // although it just has to be created using a different method, so
3110 // only store failed iconv creation attempts (or perhaps we
3111 // shouldn't do this at all?)
3112 if ( names
[0] != NULL
)
3114 for ( ; *names
; ++names
)
3116 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3117 // will need changes that will obsolete this
3118 wxString
name(*names
);
3119 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
3122 gs_nameCache
[encoding
] = *names
;
3129 gs_nameCache
[encoding
] = wxT(""); // cache the failure
3132 #endif // wxUSE_FONTMAP
3134 #endif // HAVE_ICONV
3136 #ifdef wxHAVE_WIN32_MB2WC
3139 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3140 : new wxMBConv_win32(m_encoding
);
3149 #endif // wxHAVE_WIN32_MB2WC
3153 // leave UTF16 and UTF32 to the built-ins of wx
3154 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3155 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3158 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3159 : new wxMBConv_cf(m_encoding
);
3161 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3170 #endif // __DARWIN__
3173 wxFontEncoding enc
= m_encoding
;
3175 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3177 // use "false" to suppress interactive dialogs -- we can be called from
3178 // anywhere and popping up a dialog from here is the last thing we want to
3180 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3182 #endif // wxUSE_FONTMAP
3186 case wxFONTENCODING_UTF7
:
3187 return new wxMBConvUTF7
;
3189 case wxFONTENCODING_UTF8
:
3190 return new wxMBConvUTF8
;
3192 case wxFONTENCODING_UTF16BE
:
3193 return new wxMBConvUTF16BE
;
3195 case wxFONTENCODING_UTF16LE
:
3196 return new wxMBConvUTF16LE
;
3198 case wxFONTENCODING_UTF32BE
:
3199 return new wxMBConvUTF32BE
;
3201 case wxFONTENCODING_UTF32LE
:
3202 return new wxMBConvUTF32LE
;
3205 // nothing to do but put here to suppress gcc warnings
3212 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3213 : new wxMBConv_wxwin(m_encoding
);
3220 wxLogTrace(TRACE_STRCONV
,
3221 wxT("encoding \"%s\" is not supported by this system"),
3222 (m_name
? wxString(m_name
)
3223 : wxFontMapperBase::GetEncodingName(m_encoding
)));
3224 #endif // wxUSE_FONTMAP
// Create the real conversion object: resolve the encoding if neither a name
// nor an encoding was specified, store the result of DoCreate() in m_convReal
// and reset m_deferred.
//
// The method is const, so all members are modified through a non-const
// pointer to this object.
//
// NOTE(review): the condition guarding this work and the #if/#else lines
// matching the "#endif // wxUSE_INTL" below are not visible in this extract;
// presumably the two m_encoding assignments are the alternative branches of
// that conditional -- confirm against the full file.
3229 void wxCSConv::CreateConvIfNeeded() const
3233 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3235 // if we have neither the name nor the encoding, use the default
3236 // encoding for this system
3237 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3240 self
->m_encoding
= wxLocale::GetSystemEncoding();
3242 // fallback to some reasonable default:
3243 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3244 #endif // wxUSE_INTL
3247 self
->m_convReal
= DoCreate();
3248 self
->m_deferred
= false;
3252 bool wxCSConv::IsOk() const
3254 CreateConvIfNeeded();
3256 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3257 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3258 return true; // always ok as we do it ourselves
3260 // m_convReal->IsOk() is called at its own creation, so we know it must
3261 // be ok if m_convReal is non-NULL
3262 return m_convReal
!= NULL
;
// Convert srcLen bytes of src to wide characters in dst (capacity dstLen).
//
// After making sure the real conversion object has been created, the call is
// forwarded to m_convReal->ToWChar(). The remaining code performs the
// conversion directly by widening each byte through unsigned char: srcLen
// equal to wxNO_LEN means "up to and including the trailing NUL", and
// wxCONV_FAILED is returned if dstLen is smaller than srcLen.
//
// NOTE(review): the conditions selecting between forwarding and the direct
// path, and the final return statement, are not visible in this extract.
3265 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3266 const char *src
, size_t srcLen
) const
3268 CreateConvIfNeeded();
3271 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3274 if ( srcLen
== wxNO_LEN
)
3275 srcLen
= strlen(src
) + 1; // take trailing NUL too
3279 if ( dstLen
< srcLen
)
3280 return wxCONV_FAILED
;
// the cast to unsigned char keeps bytes above 0x7F from being
// sign-extended when char is signed
3282 for ( size_t n
= 0; n
< srcLen
; n
++ )
3283 dst
[n
] = (unsigned char)(src
[n
]);
// Convert srcLen wide characters of src to bytes in dst (capacity dstLen).
//
// After making sure the real conversion object has been created, the call is
// forwarded to m_convReal->FromWChar(). The remaining code performs the
// conversion directly: srcLen equal to wxNO_LEN means "up to and including
// the trailing NUL", wxCONV_FAILED is returned if dstLen is smaller than
// srcLen or if any input character is greater than 0xFF, and otherwise each
// character is narrowed to char. A second loop only validates the input
// without writing anything.
//
// NOTE(review): the conditions selecting between forwarding, converting and
// validating only, and the final return statement, are not visible in this
// extract.
3289 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3290 const wchar_t *src
, size_t srcLen
) const
3292 CreateConvIfNeeded();
3295 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3298 if ( srcLen
== wxNO_LEN
)
3299 srcLen
= wxWcslen(src
) + 1;
3303 if ( dstLen
< srcLen
)
3304 return wxCONV_FAILED
;
3306 for ( size_t n
= 0; n
< srcLen
; n
++ )
// characters above 0xFF cannot be stored in a single byte
// NOTE(review): where wchar_t is a signed type, negative values are not
// rejected by this comparison -- confirm whether that matters here
3308 if ( src
[n
] > 0xFF )
3309 return wxCONV_FAILED
;
3311 dst
[n
] = (char)src
[n
];
3315 else // still need to check the input validity
3317 for ( size_t n
= 0; n
< srcLen
; n
++ )
3319 if ( src
[n
] > 0xFF )
3320 return wxCONV_FAILED
;
3327 size_t wxCSConv::GetMBNulLen() const
3329 CreateConvIfNeeded();
3333 return m_convReal
->GetMBNulLen();
3336 // otherwise, we are ISO-8859-1
3340 #if wxUSE_UNICODE_UTF8
3341 bool wxCSConv::IsUTF8() const
3343 CreateConvIfNeeded();
3347 return m_convReal
->IsUTF8();
3350 // otherwise, we are ISO-8859-1
3358 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3361 return wxWCharBuffer();
3363 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3365 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3367 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
3372 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3375 return wxCharBuffer();
3377 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3379 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3384 #endif // wxUSE_UNICODE
3386 // ----------------------------------------------------------------------------
3388 // ----------------------------------------------------------------------------
3390 // NB: The reason why we create converter objects in this convoluted way,
3391 // using a factory function instead of global variable, is that they
3392 // may be used at static initialization time (some of them are used by
3393 // wxString ctors and there may be a global wxString object). In other
3394 // words, possibly _before_ the converter global object would be
3401 #undef wxConvISO8859_1
3403 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3404 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3405 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3407 static impl_klass name##Obj ctor_args; \
3408 return &name##Obj; \
3410 /* this ensures that all global converter objects are created */ \
3411 /* by the time static initialization is done, i.e. before any */ \
3412 /* thread is launched: */ \
3413 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for WX_DEFINE_GLOBAL_CONV2 when the declared class and the
// implementation class of the global converter are the same.
#define WX_DEFINE_GLOBAL_CONV(convClass, convName, ctorArgs) \
    WX_DEFINE_GLOBAL_CONV2(convClass, convClass, convName, ctorArgs)
3419 // disable warning "variable 'xxx' was declared but never referenced"
3420 #pragma warning(disable: 177)
3424 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3425 #elif 0 // defined(__WXOSX__)
3426 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_cf
, wxConvLibc
, (wxFONTENCODING_UTF8
));
3428 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3431 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3432 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3433 // provokes an error message about "not enough macro parameters"; and we
3434 // can't use "()" here as the name##Obj declaration would be parsed as a
3435 // function declaration then, so use a semicolon and live with an extra
3436 // empty statement (and hope that no compiler warns about this)
3437 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3438 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
3440 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3441 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// Global converter pointers: wxConvCurrent starts out pointing to the
// wxConvLibc object and wxConvUI to the wxConvLocal one, both obtained from
// the wxGet_<name>Ptr() accessors generated by WX_DEFINE_GLOBAL_CONV2.
3443 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3444 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3447 // The xnu kernel always communicates file paths in decomposed UTF-8.
3448 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3449 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
3452 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3455 #else // !__DARWIN__
3456 wxGet_wxConvLibcPtr();
3457 #endif // __DARWIN__/!__DARWIN__
3459 #else // !wxUSE_WCHAR_T
3461 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3462 // stand-ins in absence of wchar_t
3463 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3468 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T