1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV wxT("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
74 static bool NotAllNULs(const char *p
, size_t n
)
76 while ( n
&& *p
++ == '\0' )
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(reinterpret_cast<const wxUint16
*>(*pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten
= 0;
176 // the number of NULs terminating this string
177 size_t nulLen
= 0; // not really needed, but just to avoid warnings
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
185 if ( srcLen
!= wxNO_LEN
)
187 // we need to know how to find the end of this string
188 nulLen
= GetMBNulLen();
189 if ( nulLen
== wxCONV_FAILED
)
190 return wxCONV_FAILED
;
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
197 char * const p
= bufTmp
.data();
198 memcpy(p
, src
, srcLen
);
199 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
205 srcEnd
= src
+ srcLen
;
207 else // quit after the first loop iteration
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
216 // all the complications come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd != NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
229 // try to convert the current chunk
230 size_t lenChunk
= MB2WC(NULL
, src
, 0);
231 if ( lenChunk
== wxCONV_FAILED
)
232 return wxCONV_FAILED
;
234 dstWritten
+= lenChunk
;
240 // nothing left in the input string, conversion succeeded
246 if ( dstWritten
> dstLen
)
247 return wxCONV_FAILED
;
249 // +1 is for trailing NUL
250 if ( MB2WC(dst
, src
, lenChunk
+ 1) == wxCONV_FAILED
)
251 return wxCONV_FAILED
;
260 // we convert just one chunk in this case as this is the entire
261 // string anyhow (and we don't count the trailing NUL in this case)
265 // advance the input pointer past the end of this chunk: notice that we
266 // will always stop before srcEnd because we know that the chunk is
267 // always properly NUL-terminated
268 while ( NotAllNULs(src
, nulLen
) )
270 // notice that we must skip over multiple bytes here as we suppose
271 // that if NUL takes 2 or 4 bytes, then all the other characters do
272 // too and so if we advanced by a single byte we might erroneously
273 // detect sequences of NUL bytes in the middle of the input
277 // if the buffer ends before this NUL, we shouldn't count it in our
278 // output so skip the code below
282 // do count this terminator as it's inside the buffer we convert
287 src
+= nulLen
; // skip the terminator itself
297 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
298 const wchar_t *src
, size_t srcLen
) const
300 // the number of chars [which would be] written to dst [if it were not NULL]
301 size_t dstWritten
= 0;
303 // if we don't know its length we have no choice but to assume that it is
304 // NUL-terminated (notice that it can still be NUL-terminated even if
305 // explicit length is given but it doesn't change our return value)
306 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
308 // make a copy of the input string unless it is already properly
310 wxWCharBuffer bufTmp
;
311 if ( isNulTerminated
)
313 srcLen
= wxWcslen(src
) + 1;
315 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
317 // make a copy in order to properly NUL-terminate the string
318 bufTmp
= wxWCharBuffer(srcLen
);
319 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
323 const size_t lenNul
= GetMBNulLen();
324 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
326 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
328 // try to convert the current chunk
329 size_t lenChunk
= WC2MB(NULL
, src
, 0);
331 if ( lenChunk
== wxCONV_FAILED
)
332 return wxCONV_FAILED
;
334 dstWritten
+= lenChunk
;
335 if ( src
+ lenChunk
< srcEnd
|| isNulTerminated
)
336 dstWritten
+= lenNul
;
340 if ( dstWritten
> dstLen
)
341 return wxCONV_FAILED
;
343 if ( WC2MB(dst
, src
, lenChunk
+ lenNul
) == wxCONV_FAILED
)
344 return wxCONV_FAILED
;
347 if ( src
+ lenChunk
< srcEnd
|| isNulTerminated
)
355 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
357 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
358 if ( rc
!= wxCONV_FAILED
)
360 // ToWChar() returns the buffer length, i.e. including the trailing
361 // NUL, while this method doesn't take it into account
368 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
370 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
371 if ( rc
!= wxCONV_FAILED
)
379 wxMBConv::~wxMBConv()
381 // nothing to do here (necessary for Darwin linking probably)
384 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
388 // calculate the length of the buffer needed first
389 const size_t nLen
= ToWChar(NULL
, 0, psz
);
390 if ( nLen
!= wxCONV_FAILED
)
392 // now do the actual conversion
393 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
395 // +1 for the trailing NUL
396 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
401 return wxWCharBuffer();
404 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
408 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
409 if ( nLen
!= wxCONV_FAILED
)
411 wxCharBuffer
buf(nLen
- 1);
412 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
417 return wxCharBuffer();
421 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
423 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
424 if ( dstLen
!= wxCONV_FAILED
)
426 // notice that we allocate space for dstLen+1 wide characters here
427 // because we want the buffer to always be NUL-terminated, even if the
428 // input isn't (as otherwise the caller has no way to know its length)
429 wxWCharBuffer
wbuf(dstLen
);
430 wbuf
.data()[dstLen
] = L
'\0';
431 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
437 // we also need to handle NUL-terminated input strings
438 // specially: for them the output is the length of the string
439 // excluding the trailing NUL, however if we're asked to
440 // convert a specific number of characters we return the length
441 // of the resulting output even if it's NUL-terminated
442 if ( inLen
== wxNO_LEN
)
453 return wxWCharBuffer();
457 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
459 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
460 if ( dstLen
!= wxCONV_FAILED
)
462 const size_t nulLen
= GetMBNulLen();
464 // as above, ensure that the buffer is always NUL-terminated, even if
466 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
467 memset(buf
.data() + dstLen
, 0, nulLen
);
468 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
474 if ( inLen
== wxNO_LEN
)
476 // in this case both input and output are NUL-terminated
477 // and we're not supposed to count NUL
489 return wxCharBuffer();
492 const wxWCharBuffer
wxMBConv::cMB2WC(const wxScopedCharBuffer
& buf
) const
494 const size_t srcLen
= buf
.length();
497 const size_t dstLen
= ToWChar(NULL
, 0, buf
, srcLen
);
498 if ( dstLen
!= wxCONV_FAILED
)
500 wxWCharBuffer
wbuf(dstLen
);
501 wbuf
.data()[dstLen
] = L
'\0';
502 if ( ToWChar(wbuf
.data(), dstLen
, buf
, srcLen
) != wxCONV_FAILED
)
507 return wxWCharBuffer();
510 const wxCharBuffer
wxMBConv::cWC2MB(const wxScopedWCharBuffer
& wbuf
) const
512 const size_t srcLen
= wbuf
.length();
515 const size_t dstLen
= FromWChar(NULL
, 0, wbuf
, srcLen
);
516 if ( dstLen
!= wxCONV_FAILED
)
518 wxCharBuffer
buf(dstLen
);
519 buf
.data()[dstLen
] = '\0';
520 if ( FromWChar(buf
.data(), dstLen
, wbuf
, srcLen
) != wxCONV_FAILED
)
525 return wxCharBuffer();
528 // ----------------------------------------------------------------------------
530 // ----------------------------------------------------------------------------
532 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
534 return wxMB2WC(buf
, psz
, n
);
537 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
539 return wxWC2MB(buf
, psz
, n
);
542 // ----------------------------------------------------------------------------
543 // wxConvBrokenFileNames
544 // ----------------------------------------------------------------------------
548 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
550 if ( wxStricmp(charset
, wxT("UTF-8")) == 0 ||
551 wxStricmp(charset
, wxT("UTF8")) == 0 )
552 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
554 m_conv
= new wxCSConv(charset
);
559 // ----------------------------------------------------------------------------
561 // ----------------------------------------------------------------------------
563 // Implementation (C) 2004 Fredrik Roubert
565 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
568 // BASE64 decoding table
570 static const unsigned char utf7unb64
[] =
572 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
573 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
574 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
575 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
576 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
577 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
578 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
579 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
580 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
581 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
582 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
583 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
584 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
585 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
586 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
587 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
588 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
589 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
590 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
591 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
592 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
593 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
594 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
595 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
596 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
597 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
598 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
599 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
600 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
601 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
602 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
603 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
606 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
607 const char *src
, size_t srcLen
) const
609 DecoderState stateOrig
,
611 if ( srcLen
== wxNO_LEN
)
613 // convert the entire string, up to and including the trailing NUL
614 srcLen
= strlen(src
) + 1;
616 // when working on the entire strings we don't update nor use the shift
617 // state from the previous call
618 statePtr
= &stateOrig
;
620 else // when working with partial strings we do use the shift state
622 statePtr
= const_cast<DecoderState
*>(&m_stateDecoder
);
624 // also save the old state to be able to rollback to it on error
625 stateOrig
= m_stateDecoder
;
628 // but to simplify the code below we use this variable in both cases
629 DecoderState
& state
= *statePtr
;
632 // number of characters [which would have been] written to dst [if it were
636 const char * const srcEnd
= src
+ srcLen
;
638 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
640 const unsigned char cc
= *src
++;
642 if ( state
.IsShifted() )
644 const unsigned char dc
= utf7unb64
[cc
];
647 // end of encoded part, check that nothing was left: there can
648 // be up to 4 bits of 0 padding but nothing else (we also need
649 // to check isLSB as we count bits modulo 8 while a valid UTF-7
650 // encoded sequence must contain an integral number of UTF-16
652 if ( state
.isLSB
|| state
.bit
> 4 ||
653 (state
.accum
& ((1 << state
.bit
) - 1)) )
658 return wxCONV_FAILED
;
663 // re-parse this character normally below unless it's '-' which
664 // is consumed by the decoder
668 else // valid encoded character
670 // mini base64 decoder: each character is 6 bits
675 if ( state
.bit
>= 8 )
677 // got the full byte, consume it
679 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
683 // we've got the full word, output it
685 *dst
++ = (state
.msb
<< 8) | b
;
691 // just store it while we wait for LSB
699 if ( state
.IsDirect() )
701 // start of an encoded segment?
706 // just the encoded plus sign, don't switch to shifted mode
712 else if ( utf7unb64
[(unsigned)*src
] == 0xff )
714 // empty encoded chunks are not allowed
718 return wxCONV_FAILED
;
720 else // base-64 encoded chunk follows
727 // only printable 7 bit ASCII characters (with the exception of
728 // NUL, TAB, CR and LF) can be used directly
729 if ( cc
>= 0x7f || (cc
< ' ' &&
730 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
731 return wxCONV_FAILED
;
742 // as we didn't read any characters we should be called with the same
743 // data (followed by some more new data) again later so don't save our
747 return wxCONV_FAILED
;
754 // BASE64 encoding table
756 static const unsigned char utf7enb64
[] =
758 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
759 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
760 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
761 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
762 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
763 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
764 'w', 'x', 'y', 'z', '0', '1', '2', '3',
765 '4', '5', '6', '7', '8', '9', '+', '/'
769 // UTF-7 encoding table
771 // 0 - Set D (directly encoded characters)
772 // 1 - Set O (optional direct characters)
773 // 2 - whitespace characters (optional)
774 // 3 - special characters
776 static const unsigned char utf7encode
[128] =
778 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
779 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
780 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
781 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
782 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
783 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
784 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
785 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
788 static inline bool wxIsUTF7Direct(wchar_t wc
)
790 return wc
< 0x80 && utf7encode
[wc
] < 1;
793 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
794 const wchar_t *src
, size_t srcLen
) const
796 EncoderState stateOrig
,
798 if ( srcLen
== wxNO_LEN
)
800 // we don't apply the stored state when operating on entire strings at
802 statePtr
= &stateOrig
;
804 srcLen
= wxWcslen(src
) + 1;
806 else // do use the mode we left the output in previously
808 stateOrig
= m_stateEncoder
;
809 statePtr
= const_cast<EncoderState
*>(&m_stateEncoder
);
812 EncoderState
& state
= *statePtr
;
817 const wchar_t * const srcEnd
= src
+ srcLen
;
818 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
821 if ( wxIsUTF7Direct(cc
) )
823 if ( state
.IsShifted() )
825 // pad with zeros the last encoded block if necessary
829 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
844 else if ( cc
== '+' && state
.IsDirect() )
855 else if (((wxUint32
)cc
) > 0xffff)
857 // no surrogate pair generation (yet?)
858 return wxCONV_FAILED
;
863 if ( state
.IsDirect() )
872 // BASE64 encode string
875 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
878 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
880 for (state
.bit
+= 8; state
.bit
>= 6; )
884 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
889 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
897 // we need to restore the original encoder state if we were called just to
898 // calculate the amount of space needed as we will presumably be called
899 // again to really convert the data now
906 // ----------------------------------------------------------------------------
908 // ----------------------------------------------------------------------------
910 static const wxUint32 utf8_max
[]=
911 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
913 // boundaries of the private use area we use to (temporarily) remap the
914 // characters that are invalid in a UTF-8 encoded string
915 const wxUint32 wxUnicodePUA
= 0x100000;
916 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
918 // this table gives the length of the UTF-8 encoding from its first character:
919 const unsigned char tableUtf8Lengths
[256] = {
920 // single-byte sequences (ASCII):
921 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
922 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
923 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
924 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
925 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
926 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
927 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
928 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
930 // these are invalid:
931 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
932 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
933 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
934 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
937 // two-byte sequences:
938 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
939 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
941 // three-byte sequences:
942 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
944 // four-byte sequences:
945 4, 4, 4, 4, 4, // F0..F4
947 // these are invalid again (5- or 6-byte
948 // sequences and sequences for code points
949 // above U+10FFFF, as restricted by RFC 3629):
950 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
954 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
955 const char *src
, size_t srcLen
) const
957 wchar_t *out
= dstLen
? dst
: NULL
;
960 if ( srcLen
== wxNO_LEN
)
961 srcLen
= strlen(src
) + 1;
963 for ( const char *p
= src
; ; p
++ )
965 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
967 // all done successfully, just add the trailing NUL if we are not
968 // using explicit length
969 if ( srcLen
== wxNO_LEN
)
985 if ( out
&& !dstLen
-- )
989 unsigned char c
= *p
;
993 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
996 if ( srcLen
!= wxNO_LEN
)
1003 unsigned len
= tableUtf8Lengths
[c
];
1007 if ( srcLen
< len
) // the test works for wxNO_LEN too
1010 if ( srcLen
!= wxNO_LEN
)
1013 // Char. number range | UTF-8 octet sequence
1014 // (hexadecimal) | (binary)
1015 // ----------------------+----------------------------------------
1016 // 0000 0000 - 0000 007F | 0xxxxxxx
1017 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1018 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1019 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1021 // Code point value is stored in bits marked with 'x',
1022 // lowest-order bit of the value on the right side in the diagram
1023 // above. (from RFC 3629)
1025 // mask to extract lead byte's value ('x' bits above), by sequence
1027 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1029 // mask and value of lead byte's most significant bits, by length:
1030 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1031 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1033 len
--; // it's more convenient to work with 0-based length here
1035 // extract the lead byte's value bits:
1036 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
1039 code
= c
& leadValueMask
[len
];
1041 // all remaining bytes, if any, are handled in the same way
1042 // regardless of sequence's length:
1043 for ( ; len
; --len
)
1046 if ( (c
& 0xC0) != 0x80 )
1047 return wxCONV_FAILED
;
1055 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1056 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
1065 #endif // WC_UTF16/!WC_UTF16
1073 return wxCONV_FAILED
;
1077 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
1078 const wchar_t *src
, size_t srcLen
) const
1080 char *out
= dstLen
? dst
: NULL
;
1083 for ( const wchar_t *wp
= src
; ; wp
++ )
1085 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
) )
1087 // all done successfully, just add the trailing NUL if we are not
1088 // using explicit length
1089 if ( srcLen
== wxNO_LEN
)
1105 if ( srcLen
!= wxNO_LEN
)
1110 // cast is ok for WC_UTF16
1111 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1113 // skip the next char too as we decoded a surrogate
1116 #else // wchar_t is UTF-32
1117 code
= *wp
& 0x7fffffff;
1129 out
[0] = (char)code
;
1132 else if ( code
<= 0x07FF )
1140 // NB: this line takes 6 least significant bits, encodes them as
1141 // 10xxxxxx and discards them so that the next byte can be encoded:
1142 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1143 out
[0] = 0xC0 | code
;
1146 else if ( code
< 0xFFFF )
1154 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1155 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1156 out
[0] = 0xE0 | code
;
1159 else if ( code
<= 0x10FFFF )
1167 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1168 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1169 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1170 out
[0] = 0xF0 | code
;
1175 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1188 // we only get here if an error occurs during decoding
1189 return wxCONV_FAILED
;
1192 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1193 const char *psz
, size_t srcLen
) const
1195 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1196 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1200 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1202 const char *opsz
= psz
;
1203 bool invalid
= false;
1204 unsigned char cc
= *psz
++, fc
= cc
;
1206 for (cnt
= 0; fc
& 0x80; cnt
++)
1216 // escape the escape character for octal escapes
1217 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1218 && cc
== '\\' && (!buf
|| len
< n
))
1230 // invalid UTF-8 sequence
1235 unsigned ocnt
= cnt
- 1;
1236 wxUint32 res
= cc
& (0x3f >> cnt
);
1240 if ((cc
& 0xC0) != 0x80)
1242 // invalid UTF-8 sequence
1248 res
= (res
<< 6) | (cc
& 0x3f);
1251 if (invalid
|| res
<= utf8_max
[ocnt
])
1253 // illegal UTF-8 encoding
1256 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1257 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1259 // if one of our PUA characters turns up externally
1260 // it must also be treated as an illegal sequence
1261 // (a bit like you have to escape an escape character)
1267 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1268 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1269 if (pa
== wxCONV_FAILED
)
1281 *buf
++ = (wchar_t)res
;
1283 #endif // WC_UTF16/!WC_UTF16
1289 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1291 while (opsz
< psz
&& (!buf
|| len
< n
))
1294 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1295 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1296 wxASSERT(pa
!= wxCONV_FAILED
);
1303 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1309 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1311 while (opsz
< psz
&& (!buf
|| len
< n
))
1313 if ( buf
&& len
+ 3 < n
)
1315 unsigned char on
= *opsz
;
1317 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1318 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1319 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1326 else // MAP_INVALID_UTF8_NOT
1328 return wxCONV_FAILED
;
1334 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1340 static inline bool isoctal(wchar_t wch
)
1342 return L
'0' <= wch
&& wch
<= L
'7';
1345 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1346 const wchar_t *psz
, size_t srcLen
) const
1348 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1349 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1353 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1358 // cast is ok for WC_UTF16
1359 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1360 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1362 cc
= (*psz
++) & 0x7fffffff;
1365 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1366 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1369 *buf
++ = (char)(cc
- wxUnicodePUA
);
1372 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1373 && cc
== L
'\\' && psz
[0] == L
'\\' )
1380 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1382 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1386 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1387 (psz
[1] - L
'0') * 010 +
1397 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1413 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1415 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1421 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1427 // ============================================================================
1429 // ============================================================================
1431 #ifdef WORDS_BIGENDIAN
1432 #define wxMBConvUTF16straight wxMBConvUTF16BE
1433 #define wxMBConvUTF16swap wxMBConvUTF16LE
1435 #define wxMBConvUTF16swap wxMBConvUTF16BE
1436 #define wxMBConvUTF16straight wxMBConvUTF16LE
1440 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1442 if ( srcLen
== wxNO_LEN
)
1444 // count the number of bytes in input, including the trailing NULs
1445 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1446 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1449 srcLen
*= BYTES_PER_CHAR
;
1451 else // we already have the length
1453 // we can only convert an entire number of UTF-16 characters
1454 if ( srcLen
% BYTES_PER_CHAR
)
1455 return wxCONV_FAILED
;
1461 // case when in-memory representation is UTF-16 too
1464 // ----------------------------------------------------------------------------
1465 // conversions without endianness change
1466 // ----------------------------------------------------------------------------
1469 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1470 const char *src
, size_t srcLen
) const
1472 // set up the scene for using memcpy() (which is presumably more efficient
1473 // than copying the bytes one by one)
1474 srcLen
= GetLength(src
, srcLen
);
1475 if ( srcLen
== wxNO_LEN
)
1476 return wxCONV_FAILED
;
1478 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1481 if ( dstLen
< inLen
)
1482 return wxCONV_FAILED
;
1484 memcpy(dst
, src
, srcLen
);
1491 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1492 const wchar_t *src
, size_t srcLen
) const
1494 if ( srcLen
== wxNO_LEN
)
1495 srcLen
= wxWcslen(src
) + 1;
1497 srcLen
*= BYTES_PER_CHAR
;
1501 if ( dstLen
< srcLen
)
1502 return wxCONV_FAILED
;
1504 memcpy(dst
, src
, srcLen
);
1510 // ----------------------------------------------------------------------------
1511 // endian-reversing conversions
1512 // ----------------------------------------------------------------------------
1515 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1516 const char *src
, size_t srcLen
) const
1518 srcLen
= GetLength(src
, srcLen
);
1519 if ( srcLen
== wxNO_LEN
)
1520 return wxCONV_FAILED
;
1522 srcLen
/= BYTES_PER_CHAR
;
1526 if ( dstLen
< srcLen
)
1527 return wxCONV_FAILED
;
1529 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1530 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1532 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1540 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1541 const wchar_t *src
, size_t srcLen
) const
1543 if ( srcLen
== wxNO_LEN
)
1544 srcLen
= wxWcslen(src
) + 1;
1546 srcLen
*= BYTES_PER_CHAR
;
1550 if ( dstLen
< srcLen
)
1551 return wxCONV_FAILED
;
1553 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1554 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1556 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1563 #else // !WC_UTF16: wchar_t is UTF-32
1565 // ----------------------------------------------------------------------------
1566 // conversions without endianness change
1567 // ----------------------------------------------------------------------------
// wxMBConvUTF16straight::ToWChar() for the !WC_UTF16 build (wchar_t is
// UTF-32).
//
// The input is walked as wxUint16 units between inBuff and inEnd and decoded
// with wxDecodeSurrogate(). outLen counts the characters produced and the
// conversion fails as soon as it would exceed dstLen.
//
// srcLen is normalized by GetLength(); a wxNO_LEN result means failure.
1570 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1571 const char *src
, size_t srcLen
) const
1573 srcLen
= GetLength(src
, srcLen
);
1574 if ( srcLen
== wxNO_LEN
)
1575 return wxCONV_FAILED
;
1577 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1580 // optimization: return maximal space which could be needed for this
1581 // string even if the real size could be smaller if the buffer contains
// any surrogates
1587 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
// no increment clause: wxDecodeSurrogate() receives the address of inBuff and
// presumably advances it by one or two units itself -- TODO confirm
1588 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1590 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1592 return wxCONV_FAILED
;
1594 if ( ++outLen
> dstLen
)
1595 return wxCONV_FAILED
;
// wxMBConvUTF16straight::FromWChar() for the !WC_UTF16 build (wchar_t is
// UTF-32).
//
// Every input character is passed to encode_utf16(), which fills cc and
// returns the number of 16-bit units used (or wxCONV_FAILED). outLen
// accumulates the output size in bytes and is checked against dstLen.
//
// When srcLen is wxNO_LEN it is computed as wxWcslen(src) + 1, so the
// terminating NUL is converted as well.
1605 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1606 const wchar_t *src
, size_t srcLen
) const
1608 if ( srcLen
== wxNO_LEN
)
1609 srcLen
= wxWcslen(src
) + 1;
1612 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1613 for ( size_t n
= 0; n
< srcLen
; n
++ )
1616 const size_t numChars
= encode_utf16(*src
++, cc
);
1617 if ( numChars
== wxCONV_FAILED
)
1618 return wxCONV_FAILED
;
// numChars is in 16-bit units, outLen in bytes
1620 outLen
+= numChars
* BYTES_PER_CHAR
;
1623 if ( outLen
> dstLen
)
1624 return wxCONV_FAILED
;
1627 if ( numChars
== 2 )
1629 // second character of a surrogate
1638 // ----------------------------------------------------------------------------
1639 // endian-reversing conversions
1640 // ----------------------------------------------------------------------------
// wxMBConvUTF16swap::ToWChar() for the !WC_UTF16 build (wchar_t is UTF-32).
//
// The input is UTF-16 in the opposite byte order: units are byte-swapped
// with wxUINT16_SWAP_ALWAYS() into the local tmp array, which is then decoded
// by decode_utf16() into ch. outLen counts the characters produced and must
// not exceed dstLen.
//
// srcLen is normalized by GetLength(); a wxNO_LEN result means failure.
1643 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1644 const char *src
, size_t srcLen
) const
1646 srcLen
= GetLength(src
, srcLen
);
1647 if ( srcLen
== wxNO_LEN
)
1648 return wxCONV_FAILED
;
1650 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1653 // optimization: return maximal space which could be needed for this
1654 // string even if the real size could be smaller if the buffer contains
// any surrogates
1660 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1661 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
// decode_utf16() needs the units in native byte order, so swap them into a
// temporary first
1666 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1668 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1670 const size_t numChars
= decode_utf16(tmp
, ch
);
1671 if ( numChars
== wxCONV_FAILED
)
1672 return wxCONV_FAILED
;
// presumably the surrogate pair case, in which a second input unit was used
1674 if ( numChars
== 2 )
1677 if ( ++outLen
> dstLen
)
1678 return wxCONV_FAILED
;
// wxMBConvUTF16swap::FromWChar() for the !WC_UTF16 build (wchar_t is UTF-32).
//
// Every input character is encoded with encode_utf16() into cc and the
// resulting one or two 16-bit units are written to dst after being
// byte-swapped with wxUINT16_SWAP_ALWAYS(). outLen accumulates the output
// size in bytes and is checked against dstLen before anything is written.
//
// When srcLen is wxNO_LEN it is computed as wxWcslen(src) + 1, so the
// terminating NUL is converted as well.
1688 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1689 const wchar_t *src
, size_t srcLen
) const
1691 if ( srcLen
== wxNO_LEN
)
1692 srcLen
= wxWcslen(src
) + 1;
1695 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1696 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1699 const size_t numChars
= encode_utf16(*src
, cc
);
1700 if ( numChars
== wxCONV_FAILED
)
1701 return wxCONV_FAILED
;
// numChars is in 16-bit units, outLen in bytes
1703 outLen
+= numChars
* BYTES_PER_CHAR
;
1706 if ( outLen
> dstLen
)
1707 return wxCONV_FAILED
;
1709 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1710 if ( numChars
== 2 )
1712 // second character of a surrogate
1713 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1721 #endif // WC_UTF16/!WC_UTF16
1724 // ============================================================================
1726 // ============================================================================
1728 #ifdef WORDS_BIGENDIAN
1729 #define wxMBConvUTF32straight wxMBConvUTF32BE
1730 #define wxMBConvUTF32swap wxMBConvUTF32LE
1732 #define wxMBConvUTF32swap wxMBConvUTF32BE
1733 #define wxMBConvUTF32straight wxMBConvUTF32LE
1737 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1738 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
// Normalize the length of a UTF-32 input buffer.
//
// If srcLen is wxNO_LEN the input is scanned as wxUint32 values up to and
// including the first zero one (the counter starts at 1 to account for it)
// and the count is converted to bytes. Otherwise srcLen is only validated:
// it must be a multiple of BYTES_PER_CHAR, else wxCONV_FAILED is returned.
//
// NOTE(review): the callers in this file compare the result with wxNO_LEN
// while the failure value returned here is wxCONV_FAILED -- presumably both
// constants have the same value; confirm against their definitions.
1741 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1743 if ( srcLen
== wxNO_LEN
)
1745 // count the number of bytes in input, including the trailing NULs
1746 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1747 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
// the loop above counted characters, convert the result to bytes
1750 srcLen
*= BYTES_PER_CHAR
;
1752 else // we already have the length
1754 // we can only convert an entire number of UTF-32 characters
1755 if ( srcLen
% BYTES_PER_CHAR
)
1756 return wxCONV_FAILED
;
1762 // case when in-memory representation is UTF-16
1765 // ----------------------------------------------------------------------------
1766 // conversions without endianness change
1767 // ----------------------------------------------------------------------------
// wxMBConvUTF32straight::ToWChar() for the WC_UTF16 build (wchar_t is
// UTF-16).
//
// The input is read as wxUint32 values and each of them is passed to
// encode_utf16(), which fills cc and returns the number of 16-bit units used
// (or wxCONV_FAILED). The accumulated output length is checked against
// dstLen.
//
// srcLen is normalized by GetLength(); a wxNO_LEN result means failure.
1770 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1771 const char *src
, size_t srcLen
) const
1773 srcLen
= GetLength(src
, srcLen
);
1774 if ( srcLen
== wxNO_LEN
)
1775 return wxCONV_FAILED
;
1777 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
// number of UTF-32 characters in the input
1778 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1780 for ( size_t n
= 0; n
< inLen
; n
++ )
1783 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1784 if ( numChars
== wxCONV_FAILED
)
1785 return wxCONV_FAILED
;
1790 if ( outLen
> dstLen
)
1791 return wxCONV_FAILED
;
1794 if ( numChars
== 2 )
1796 // second character of a surrogate
// wxMBConvUTF32straight::FromWChar() for the WC_UTF16 build (wchar_t is
// UTF-16).
//
// The UTF-16 input is decoded with wxDecodeSurrogate() and dst is written as
// an array of wxUint32. outLen grows by BYTES_PER_CHAR for every character
// produced and is checked against dstLen (a size in bytes).
//
// One path returns srcLen * BYTES_PER_CHAR directly as an upper bound of the
// space needed instead of running the loop (see the comment below).
//
// When srcLen is wxNO_LEN it is computed as wxWcslen(src) + 1, so the
// terminating NUL is converted as well.
1806 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1807 const wchar_t *src
, size_t srcLen
) const
1809 if ( srcLen
== wxNO_LEN
)
1810 srcLen
= wxWcslen(src
) + 1;
1814 // optimization: return maximal space which could be needed for this
1815 // string instead of the exact amount which could be less if there are
1816 // any surrogates in the input
1818 // we consider that surrogates are rare enough to make it worthwhile to
1819 // avoid running the loop below at the cost of slightly extra memory
1821 return srcLen
* BYTES_PER_CHAR
;
1824 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
// no increment clause: wxDecodeSurrogate() receives the address of src and
// presumably advances it by one or two units itself -- TODO confirm
1826 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1828 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1830 return wxCONV_FAILED
;
1832 outLen
+= BYTES_PER_CHAR
;
1834 if ( outLen
> dstLen
)
1835 return wxCONV_FAILED
;
1843 // ----------------------------------------------------------------------------
1844 // endian-reversing conversions
1845 // ----------------------------------------------------------------------------
// wxMBConvUTF32swap::ToWChar() for the WC_UTF16 build (wchar_t is UTF-16).
//
// The input is UTF-32 in the opposite byte order: each wxUint32 is
// byte-swapped with wxUINT32_SWAP_ALWAYS() and then passed to encode_utf16(),
// which fills cc and returns the number of 16-bit units used (or
// wxCONV_FAILED). The accumulated output length is checked against dstLen.
//
// srcLen is normalized by GetLength(); a wxNO_LEN result means failure.
1848 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1849 const char *src
, size_t srcLen
) const
1851 srcLen
= GetLength(src
, srcLen
);
1852 if ( srcLen
== wxNO_LEN
)
1853 return wxCONV_FAILED
;
1855 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
// number of UTF-32 characters in the input
1856 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1858 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1861 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1862 if ( numChars
== wxCONV_FAILED
)
1863 return wxCONV_FAILED
;
1868 if ( outLen
> dstLen
)
1869 return wxCONV_FAILED
;
1872 if ( numChars
== 2 )
1874 // second character of a surrogate
// wxMBConvUTF32swap::FromWChar() for the WC_UTF16 build (wchar_t is UTF-16).
//
// The UTF-16 input is decoded with wxDecodeSurrogate() and every resulting
// character is stored in dst (accessed as wxUint32) after being byte-swapped
// with wxUINT32_SWAP_ALWAYS(). outLen grows by BYTES_PER_CHAR per character
// and is checked against dstLen (a size in bytes) before the store.
//
// One path returns srcLen * BYTES_PER_CHAR directly as an upper bound of the
// space needed instead of running the loop (see the comment below).
//
// When srcLen is wxNO_LEN it is computed as wxWcslen(src) + 1, so the
// terminating NUL is converted as well.
1884 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1885 const wchar_t *src
, size_t srcLen
) const
1887 if ( srcLen
== wxNO_LEN
)
1888 srcLen
= wxWcslen(src
) + 1;
1892 // optimization: return maximal space which could be needed for this
1893 // string instead of the exact amount which could be less if there are
1894 // any surrogates in the input
1896 // we consider that surrogates are rare enough to make it worthwhile to
1897 // avoid running the loop below at the cost of slightly extra memory
1899 return srcLen
*BYTES_PER_CHAR
;
1902 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
// no increment clause: wxDecodeSurrogate() receives the address of src and
// presumably advances it by one or two units itself -- TODO confirm
1904 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1906 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1908 return wxCONV_FAILED
;
1910 outLen
+= BYTES_PER_CHAR
;
1912 if ( outLen
> dstLen
)
1913 return wxCONV_FAILED
;
1915 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1921 #else // !WC_UTF16: wchar_t is UTF-32
1923 // ----------------------------------------------------------------------------
1924 // conversions without endianness change
1925 // ----------------------------------------------------------------------------
// wxMBConvUTF32straight::ToWChar() for the !WC_UTF16 build (wchar_t is
// UTF-32).
//
// No transcoding is required in this configuration, so the input is copied
// with memcpy().
//
// srcLen is normalized by GetLength(); a wxNO_LEN result means failure.
// inLen is the input length in characters and is the value dstLen is
// compared against before copying.
1928 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1929 const char *src
, size_t srcLen
) const
1931 // use memcpy() as it should be much faster than hand-written loop
1932 srcLen
= GetLength(src
, srcLen
);
1933 if ( srcLen
== wxNO_LEN
)
1934 return wxCONV_FAILED
;
1936 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1939 if ( dstLen
< inLen
)
1940 return wxCONV_FAILED
;
// srcLen is still a byte count here, which is what memcpy() expects
1942 memcpy(dst
, src
, srcLen
);
// wxMBConvUTF32straight::FromWChar() for the !WC_UTF16 build (wchar_t is
// UTF-32).
//
// When srcLen is wxNO_LEN the length is computed as wxWcslen(src) + 1, i.e.
// the terminating NUL is converted too. srcLen is then scaled to a byte
// count and compared with dstLen (so dstLen is a size in bytes here) before
// the data is copied unchanged with memcpy().
1949 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1950 const wchar_t *src
, size_t srcLen
) const
1952 if ( srcLen
== wxNO_LEN
)
1953 srcLen
= wxWcslen(src
) + 1;
// from now on srcLen is expressed in bytes, not in wide characters
1955 srcLen
*= BYTES_PER_CHAR
;
1959 if ( dstLen
< srcLen
)
1960 return wxCONV_FAILED
;
1962 memcpy(dst
, src
, srcLen
);
1968 // ----------------------------------------------------------------------------
1969 // endian-reversing conversions
1970 // ----------------------------------------------------------------------------
// wxMBConvUTF32swap::ToWChar() for the !WC_UTF16 build (wchar_t is UTF-32).
//
// The input is read as a sequence of wxUint32 values and each of them is
// byte-swapped with wxUINT32_SWAP_ALWAYS() while being stored in dst.
//
// srcLen is normalized by GetLength() (wxNO_LEN result => wxCONV_FAILED) and
// then converted from bytes to a character count, which is what dstLen is
// compared against.
1973 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1974 const char *src
, size_t srcLen
) const
1976 srcLen
= GetLength(src
, srcLen
);
1977 if ( srcLen
== wxNO_LEN
)
1978 return wxCONV_FAILED
;
// from now on srcLen is the number of 32-bit units, not of bytes
1980 srcLen
/= BYTES_PER_CHAR
;
1984 if ( dstLen
< srcLen
)
1985 return wxCONV_FAILED
;
1987 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1988 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1990 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
// wxMBConvUTF32swap::FromWChar() for the !WC_UTF16 build (wchar_t is UTF-32).
//
// Each wchar_t of the input is byte-swapped with wxUINT32_SWAP_ALWAYS() and
// written to dst, which is accessed as an array of wxUint32.
//
// When srcLen is wxNO_LEN it is computed as wxWcslen(src) + 1 (terminating
// NUL included). It is then scaled to bytes and compared with dstLen.
1998 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1999 const wchar_t *src
, size_t srcLen
) const
2001 if ( srcLen
== wxNO_LEN
)
2002 srcLen
= wxWcslen(src
) + 1;
2004 srcLen
*= BYTES_PER_CHAR
;
2008 if ( dstLen
< srcLen
)
2009 return wxCONV_FAILED
;
2011 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
// srcLen is a byte count at this point, hence n advances by BYTES_PER_CHAR
// for every wide character consumed
2012 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
2014 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
2021 #endif // WC_UTF16/!WC_UTF16
2024 // ============================================================================
2025 // The classes doing conversion using the iconv_xxx() functions
2026 // ============================================================================
2030 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2031 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2032 // (unless there's yet another bug in glibc) the only case when iconv()
2033 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2034 // left in the input buffer -- when _real_ error occurs,
2035 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2036 // iconv() failure.
2037 // [This bug does not appear in glibc 2.2.]
2038 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2039 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2040 (errno != E2BIG || bufLeft != 0))
2042 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2045 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2047 #define ICONV_T_INVALID ((iconv_t)-1)
2049 #if SIZEOF_WCHAR_T == 4
2050 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2051 #define WC_ENC wxFONTENCODING_UTF32
2052 #elif SIZEOF_WCHAR_T == 2
2053 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2054 #define WC_ENC wxFONTENCODING_UTF16
2055 #else // sizeof(wchar_t) != 2 nor 4
2056 // does this ever happen?
2057 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2060 // ----------------------------------------------------------------------------
2061 // wxMBConv_iconv: encapsulates an iconv character set
2062 // ----------------------------------------------------------------------------
// wxMBConv implementation delegating the conversions to iconv().
//
// Two iconv handles are used, m2w (multibyte -> wide char) and w2m (wide
// char -> multibyte); the object is usable only if both are valid, which is
// what the check against ICONV_T_INVALID below tests.
2064 class wxMBConv_iconv
: public wxMBConv
// name is the charset name passed to iconv_open()
2067 wxMBConv_iconv(const char *name
);
2068 virtual ~wxMBConv_iconv();
2070 // implement base class virtual methods
2071 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
2072 const char *src
, size_t srcLen
= wxNO_LEN
) const;
2073 virtual size_t FromWChar(char *dst
, size_t dstLen
,
2074 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
2075 virtual size_t GetMBNulLen() const;
2077 #if wxUSE_UNICODE_UTF8
2078 virtual bool IsUTF8() const;
// the copy is created from the stored encoding name and inherits the cached
// NUL length so that it doesn't have to be computed again
2081 virtual wxMBConv
*Clone() const
2083 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
2084 p
->m_minMBCharWidth
= m_minMBCharWidth
;
2089 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
2092 // the iconv handlers used to translate from multibyte
2093 // to wide char and in the other direction
2098 // guards access to m2w and w2m objects
2099 wxMutex m_iconvMutex
;
2103 // the name (for iconv_open()) of a wide char charset -- if none is
2104 // available on this machine, it will remain empty
2105 static wxString ms_wcCharsetName
;
2107 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2108 // different endian-ness than the native one
2109 static bool ms_wcNeedsSwap
;
2112 // name of the encoding handled by this conversion
2115 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2117 size_t m_minMBCharWidth
;
2120 // make the constructor available for unit testing
//
// Allocates a wxMBConv_iconv for the charset called name on the heap and
// checks it with IsOk() before handing it out.
2121 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2123 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
// an object for which iconv_open() failed is not usable
2124 if ( !result
->IsOk() )
2133 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2134 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
// Open the iconv handles for converting between the charset called name and
// wchar_t.
//
// The first object constructed also determines which iconv charset
// represents wchar_t on this system: the candidate names are tried in turn,
// first with an explicit byte order suffix ("LE"/"BE" for the native order)
// and then without it, in which case a test conversion is used to find out
// whether the result needs byte swapping. The outcome is cached in the
// static ms_wcCharsetName and ms_wcNeedsSwap so that later objects can open
// m2w directly.
//
// If no wide char charset could be found, w2m is set to ICONV_T_INVALID.
2136 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
// 0 means that the NUL length has not been computed yet
2139 m_minMBCharWidth
= 0;
2141 // check for charset that represents wchar_t:
2142 if ( ms_wcCharsetName
.empty() )
2144 wxLogTrace(TRACE_STRCONV
, wxT("Looking for wide char codeset:"));
2147 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2148 #else // !wxUSE_FONTMAP
2149 static const wxChar
*names_static
[] =
// NB: the comparisons must use "==", a single "=" is not a valid operator in
//     a preprocessor conditional expression
2151 #if SIZEOF_WCHAR_T == 4
2153 #elif SIZEOF_WCHAR_T == 2
2158 const wxChar
**names
= names_static
;
2159 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
// stop at the first name for which a working charset was found
2161 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2163 const wxString
nameCS(*names
);
2165 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2166 wxString
nameXE(nameCS
);
2168 #ifdef WORDS_BIGENDIAN
2169 nameXE
+= wxT("BE");
2170 #else // little endian
2171 nameXE
+= wxT("LE");
2174 wxLogTrace(TRACE_STRCONV
, wxT(" trying charset \"%s\""),
2177 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2178 if ( m2w
== ICONV_T_INVALID
)
2180 // try charset w/o bytesex info (e.g. "UCS4")
2181 wxLogTrace(TRACE_STRCONV
, wxT(" trying charset \"%s\""),
2183 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2185 // and check for bytesex ourselves:
2186 if ( m2w
!= ICONV_T_INVALID
)
2188 char buf
[2], *bufPtr
;
2197 outsz
= SIZEOF_WCHAR_T
* 2;
2198 char* wbufPtr
= (char*)wbuf
;
2202 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2205 if (ICONV_FAILED(res
, insz
))
2207 wxLogLastError(wxT("iconv"));
2208 wxLogError(_("Conversion to charset '%s' doesn't work."),
2211 else // ok, can convert to this encoding, remember it
2213 ms_wcCharsetName
= nameCS
;
// if the converted character differs from the original one, iconv produced
// it in the non-native byte order
2214 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2218 else // use charset not requiring byte swapping
2220 ms_wcCharsetName
= nameXE
;
2224 wxLogTrace(TRACE_STRCONV
,
2225 wxT("iconv wchar_t charset is \"%s\"%s"),
2226 ms_wcCharsetName
.empty() ? wxString("<none>")
2228 ms_wcNeedsSwap
? wxT(" (needs swap)")
2231 else // we already have ms_wcCharsetName
2233 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2236 if ( ms_wcCharsetName
.empty() )
2238 w2m
= ICONV_T_INVALID
;
2242 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2243 if ( w2m
== ICONV_T_INVALID
)
2245 wxLogTrace(TRACE_STRCONV
,
2246 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2247 ms_wcCharsetName
.c_str(), name
);
// Each of the two iconv handles is dealt with only if it is valid, i.e. if
// the corresponding iconv_open() call in the constructor succeeded.
2252 wxMBConv_iconv::~wxMBConv_iconv()
2254 if ( m2w
!= ICONV_T_INVALID
)
2256 if ( w2m
!= ICONV_T_INVALID
)
// Convert the multibyte string src to wide characters using iconv().
//
// dst/dstLen: output buffer and its size in wide characters (it is scaled
//             by SIZEOF_WCHAR_T below because iconv works with bytes); a
//             separate branch handles the case of no destination buffer, in
//             which only the needed size is computed using a temporary
//             buffer of 8 wide characters.
// src/srcLen: input and its length in bytes; if srcLen is wxNO_LEN the
//             length is determined here, using GetMBNulLen() to find out
//             how many NUL bytes terminate a string in this encoding.
//
// Returns wxCONV_FAILED if the conversion fails (this is only traced, not
// logged as an error).
2261 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2262 const char *src
, size_t srcLen
) const
2264 if ( srcLen
== wxNO_LEN
)
2266 // find the string length: notice that must be done differently for
2267 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
// NULs
2269 const size_t nulLen
= GetMBNulLen();
2273 return wxCONV_FAILED
;
2276 srcLen
= strlen(src
); // arguably more optimized than our version
2281 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2282 // but they also have to start at character boundary and not
2283 // span two adjacent characters
2285 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2291 // when we're determining the length of the string ourselves we count
2292 // the terminating NUL(s) as part of it and always NUL-terminate the
// output
2297 // we express length in the number of (wide) characters but iconv always
2298 // counts buffer sizes in bytes
2299 dstLen
*= SIZEOF_WCHAR_T
;
2302 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2303 // Unfortunately there are a couple of global wxCSConv objects such as
2304 // wxConvLocal that are used all over wx code, so we have to make sure
2305 // the handle is used by at most one thread at the time. Otherwise
2306 // only a few wx classes would be safe to use from non-main threads
2307 // as MB<->WC conversion would fail "randomly".
2308 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2309 #endif // wxUSE_THREADS
2312 const char *pszPtr
= src
;
2316 char* bufPtr
= (char*)dst
;
2318 // have destination buffer, convert there
2319 size_t dstLenOrig
= dstLen
;
2321 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2324 // convert the number of bytes converted as returned by iconv to the
2325 // number of (wide) characters converted that we need
2326 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2330 // convert to native endianness
2331 for ( unsigned i
= 0; i
< res
; i
++ )
2332 dst
[i
] = WC_BSWAP(dst
[i
]);
2335 else // no destination buffer
2337 // convert using temp buffer to calculate the size of the buffer needed
2343 char* bufPtr
= (char*)tbuf
;
2344 dstLen
= 8 * SIZEOF_WCHAR_T
;
2347 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
// dstLen is what is left of the 8 character buffer after this round
2350 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
// E2BIG only means that the temporary buffer was filled, so go on
2352 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2355 if (ICONV_FAILED(cres
, srcLen
))
2357 //VS: it is ok if iconv fails, hence trace only
2358 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2359 return wxCONV_FAILED
;
// Convert the wide character string src to multibyte using iconv().
//
// dst/dstLen: output buffer and its size in bytes; a separate branch handles
//             the case of no destination buffer, in which only the needed
//             size is computed using a temporary buffer.
// src/srcLen: input and its length in wide characters; if srcLen is
//             wxNO_LEN it is computed as wxWcslen(src) + 1, i.e. the
//             terminating NUL is converted too.
//
// Returns wxCONV_FAILED if the conversion fails (this is only traced, not
// logged as an error).
2365 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2366 const wchar_t *src
, size_t srcLen
) const
2369 // NB: the reason for the locking is explained in ToWChar()
2370 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2373 if ( srcLen
== wxNO_LEN
)
2374 srcLen
= wxWcslen(src
) + 1;
// iconv() counts both buffers in bytes
2376 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2377 size_t outbuflen
= dstLen
;
2380 wchar_t *tmpbuf
= 0;
2384 // need to copy to temp buffer to switch endianness
2385 // (doing WC_BSWAP twice on the original buffer won't work, as it
2386 // could be in read-only memory, or be accessed in some other thread)
2387 tmpbuf
= (wchar_t *)malloc(inbuflen
);
2388 for ( size_t i
= 0; i
< srcLen
; i
++ )
2389 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2394 char* inbuf
= (char*)src
;
2397 // have destination buffer, convert there
2398 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
// outbuflen is the space left, so the difference is the number of bytes
// written
2400 res
= dstLen
- outbuflen
;
2402 else // no destination buffer
2404 // convert using temp buffer to calculate the size of the buffer needed
2410 outbuflen
= WXSIZEOF(tbuf
);
2412 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2414 res
+= WXSIZEOF(tbuf
) - outbuflen
;
// E2BIG only means that the temporary buffer was filled, so go on
2416 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2424 if (ICONV_FAILED(cres
, inbuflen
))
2426 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2427 return wxCONV_FAILED
;
// Return the number of bytes used for the NUL character in this encoding.
//
// The value is computed on first use by converting an empty wide string
// (i.e. just its terminating NUL) with iconv() and measuring how far the
// output pointer advanced; it is then cached in m_minMBCharWidth, whose
// initial value of 0 means that it hasn't been computed yet. If iconv()
// fails, (size_t)-1 is cached instead.
2433 size_t wxMBConv_iconv::GetMBNulLen() const
2435 if ( m_minMBCharWidth
== 0 )
// this is a const method but the cached value must be updated
2437 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2440 // NB: the reason for the locking is explained in ToWChar()
2441 wxMutexLocker
lock(self
->m_iconvMutex
);
2444 const wchar_t *wnul
= L
"";
2445 char buf
[8]; // should be enough for NUL in any encoding
2446 size_t inLen
= sizeof(wchar_t),
2447 outLen
= WXSIZEOF(buf
);
2448 char *inBuff
= (char *)wnul
;
2449 char *outBuff
= buf
;
2450 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2452 self
->m_minMBCharWidth
= (size_t)-1;
// iconv() advanced outBuff past the bytes it has written
2456 self
->m_minMBCharWidth
= outBuff
- buf
;
2460 return m_minMBCharWidth
;
2463 #if wxUSE_UNICODE_UTF8
// Return true if the encoding name of this object is "UTF-8" or "UTF8",
// compared case-insensitively with wxStricmp().
2464 bool wxMBConv_iconv::IsUTF8() const
2466 return wxStricmp(m_name
, "UTF-8") == 0 ||
2467 wxStricmp(m_name
, "UTF8") == 0;
2471 #endif // HAVE_ICONV
2474 // ============================================================================
2475 // Win32 conversion classes
2476 // ============================================================================
2478 #ifdef wxHAVE_WIN32_MB2WC
2482 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2483 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
// wxMBConv implementation on top of the Win32 MultiByteToWideChar() and
// WideCharToMultiByte() functions for a single code page, m_CodePage.
//
// As these functions can silently produce lossy results, both conversion
// directions take extra care to detect this and return wxCONV_FAILED: either
// by passing flags making the API fail/report replacements or, when this is
// not possible, by converting the result back and comparing it with the
// input.
2486 class wxMBConv_win32
: public wxMBConv
// default: use the ANSI code page
2491 m_CodePage
= CP_ACP
;
2492 m_minMBCharWidth
= 0;
2495 wxMBConv_win32(const wxMBConv_win32
& conv
)
2498 m_CodePage
= conv
.m_CodePage
;
2499 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
// the code page is looked up from the charset name
2503 wxMBConv_win32(const char* name
)
2505 m_CodePage
= wxCharsetToCodepage(name
);
2506 m_minMBCharWidth
= 0;
// the code page is looked up from the font encoding
2509 wxMBConv_win32(wxFontEncoding encoding
)
2511 m_CodePage
= wxEncodingToCodepage(encoding
);
2512 m_minMBCharWidth
= 0;
2514 #endif // wxUSE_FONTMAP
// Convert the NUL-terminated string psz to wide characters, storing at most n
// of them in buf; if buf is NULL only the size needed is computed.
2516 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2518 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2519 // the behaviour is not compatible with the Unix version (using iconv)
2520 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2521 // wouldn't work if reading an incomplete MB char didn't result in an
// error
2524 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2525 // Win XP or newer and it is not supported for UTF-[78] so we always
2526 // use our own conversions in this case. See
2527 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2528 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2529 if ( m_CodePage
== CP_UTF8
)
2531 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2534 if ( m_CodePage
== CP_UTF7
)
2536 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2540 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2541 IsAtLeastWin2kSP4() )
2543 flags
= MB_ERR_INVALID_CHARS
;
2546 const size_t len
= ::MultiByteToWideChar
2548 m_CodePage
, // code page
2549 flags
, // flags: fail on error
2550 psz
, // input string
2551 -1, // its length (NUL-terminated)
2552 buf
, // output string
2553 buf
? n
: 0 // size of output buffer
2557 // function totally failed
2558 return wxCONV_FAILED
;
2561 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2562 // check if we succeeded, by doing a double trip:
2563 if ( !flags
&& buf
)
2565 const size_t mbLen
= strlen(psz
);
2566 wxCharBuffer
mbBuf(mbLen
);
2567 if ( ::WideCharToMultiByte
2574 mbLen
+ 1, // size in bytes, not length
2578 strcmp(mbBuf
, psz
) != 0 )
2580 // we didn't obtain the same thing we started from, hence
2581 // the conversion was lossy and we consider that it failed
2582 return wxCONV_FAILED
;
2586 // note that it returns count of written chars for buf != NULL and size
2587 // of the needed buffer for buf == NULL so in either case the length of
2588 // the string (which never includes the terminating NUL) is one less
// Convert the NUL-terminated wide string pwz to multibyte, storing at most n
// bytes in buf; if buf is NULL only the size needed is computed.
2592 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2595 we have a problem here: by default, WideCharToMultiByte() may
2596 replace characters unrepresentable in the target code page with bad
2597 quality approximations such as turning "1/2" symbol (U+00BD) into
2598 "1" for the code pages which don't have it and we, obviously, want
2599 to avoid this at any price
2601 the trouble is that this function does it _silently_, i.e. it won't
2602 even tell us whether it did or not... Win98/2000 and higher provide
2603 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2604 we have to resort to a round trip, i.e. check that converting back
2605 results in the same string -- this is, of course, expensive but
2606 otherwise we simply can't be sure to not garble the data.
2609 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2610 // it doesn't work with CJK encodings (which we test for rather roughly
2611 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
// supporting it
2613 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2616 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2618 // it's our lucky day
2619 flags
= WC_NO_BEST_FIT_CHARS
;
2620 pUsedDef
= &usedDef
;
2622 else // old system or unsupported encoding
2628 const size_t len
= ::WideCharToMultiByte
2630 m_CodePage
, // code page
2631 flags
, // either none or no best fit
2632 pwz
, // input string
2633 -1, // it is (wide) NUL-terminated
2634 buf
, // output buffer
2635 buf
? n
: 0, // and its size
2636 NULL
, // default "replacement" char
2637 pUsedDef
// [out] was it used?
2642 // function totally failed
2643 return wxCONV_FAILED
;
2646 // we did something, check if we really succeeded
2649 // check if the conversion failed, i.e. if any replacements
2652 return wxCONV_FAILED
;
2654 else // we must resort to double tripping...
2656 // first we need to ensure that we really have the MB data: this is
2657 // not the case if we're called with NULL buffer, in which case we
2658 // need to do the conversion yet again
2659 wxCharBuffer bufDef
;
2662 bufDef
= wxCharBuffer(len
);
2663 buf
= bufDef
.data();
2664 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2665 buf
, len
, NULL
, NULL
) )
2666 return wxCONV_FAILED
;
2671 wxWCharBuffer
wcBuf(n
);
2672 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2673 wcscmp(wcBuf
, pwz
) != 0 )
2675 // we didn't obtain the same thing we started from, hence
2676 // the conversion was lossy and we consider that it failed
2677 return wxCONV_FAILED
;
2681 // see the comment above for the reason of "len - 1"
// Return the number of bytes used for the NUL character in this code page:
// computed on first use by asking WideCharToMultiByte() for the size needed
// to convert just the NUL and cached in m_minMBCharWidth afterwards.
2685 virtual size_t GetMBNulLen() const
2687 if ( m_minMBCharWidth
== 0 )
2689 int len
= ::WideCharToMultiByte
2691 m_CodePage
, // code page
2693 L
"", // input string
2694 1, // translate just the NUL
2695 NULL
, // output buffer
2697 NULL
, // no replacement char
2698 NULL
// [out] don't care if it was used
// this is a const method but the cached value must be updated
2701 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2705 wxLogDebug(wxT("Unexpected NUL length %d"), len
);
2706 self
->m_minMBCharWidth
= (size_t)-1;
2710 self
->m_minMBCharWidth
= (size_t)-1;
2716 self
->m_minMBCharWidth
= len
;
2721 return m_minMBCharWidth
;
2724 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
// a code page of -1 marks an unusable object
2726 bool IsOk() const { return m_CodePage
!= -1; }
// Return true if WC_NO_BEST_FIT_CHARS can be used on the system we're running
// under; the OS version check is done only once and its result is kept in a
// static variable (-1 meaning "not checked yet").
2729 static bool CanUseNoBestFit()
2731 static int s_isWin98Or2k
= -1;
2733 if ( s_isWin98Or2k
== -1 )
2736 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2738 case wxOS_WINDOWS_9X
:
2739 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2742 case wxOS_WINDOWS_NT
:
2743 s_isWin98Or2k
= verMaj
>= 5;
2747 // unknown: be conservative by default
2752 wxASSERT_MSG( s_isWin98Or2k
!= -1, wxT("should be set above") );
2755 return s_isWin98Or2k
== 1;
// Return true if running under Windows 2000 SP4 or any later version; as
// above, the check is done only once and its result is kept in a static
// variable (-1 meaning "not checked yet").
2758 static bool IsAtLeastWin2kSP4()
2763 static int s_isAtLeastWin2kSP4
= -1;
2765 if ( s_isAtLeastWin2kSP4
== -1 )
2767 OSVERSIONINFOEX ver
;
2769 memset(&ver
, 0, sizeof(ver
));
2770 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2771 GetVersionEx((OSVERSIONINFO
*)&ver
);
2773 s_isAtLeastWin2kSP4
=
2774 ((ver
.dwMajorVersion
> 5) || // Vista+
2775 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2776 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2777 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2781 return s_isAtLeastWin2kSP4
== 1;
2786 // the code page we're working with
2789 // cached result of GetMBNulLen(), set to 0 initially meaning
// "unknown"
2791 size_t m_minMBCharWidth
;
2794 #endif // wxHAVE_WIN32_MB2WC
2797 // ============================================================================
2798 // wxEncodingConverter based conversion classes
2799 // ============================================================================
2803 class wxMBConv_wxwin
: public wxMBConv
2808 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2809 // The wxMBConv_cf class does a better job.
2810 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2811 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2812 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2816 // temporarily just use wxEncodingConverter stuff,
2817 // so that it works while a better implementation is built
2818 wxMBConv_wxwin(const char* name
)
2821 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2823 m_enc
= wxFONTENCODING_SYSTEM
;
2828 wxMBConv_wxwin(wxFontEncoding enc
)
2835 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2837 size_t inbuf
= strlen(psz
);
2840 if (!m2w
.Convert(psz
, buf
))
2841 return wxCONV_FAILED
;
2846 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2848 const size_t inbuf
= wxWcslen(psz
);
2851 if (!w2m
.Convert(psz
, buf
))
2852 return wxCONV_FAILED
;
2858 virtual size_t GetMBNulLen() const
2862 case wxFONTENCODING_UTF16BE
:
2863 case wxFONTENCODING_UTF16LE
:
2866 case wxFONTENCODING_UTF32BE
:
2867 case wxFONTENCODING_UTF32LE
:
2875 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2877 bool IsOk() const { return m_ok
; }
2880 wxFontEncoding m_enc
;
2881 wxEncodingConverter m2w
, w2m
;
2884 // were we initialized successfully?
2887 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin
);
2890 // make the constructors available for unit testing
2891 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2893 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2894 if ( !result
->IsOk() )
2903 #endif // wxUSE_FONTMAP
2905 // ============================================================================
2906 // wxCSConv implementation
2907 // ============================================================================
2909 void wxCSConv::Init()
2916 wxCSConv::wxCSConv(const wxString
& charset
)
2920 if ( !charset
.empty() )
2922 SetName(charset
.ToAscii());
2926 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2927 if ( m_encoding
== wxFONTENCODING_MAX
)
2929 // set to unknown/invalid value
2930 m_encoding
= wxFONTENCODING_SYSTEM
;
2932 else if ( m_encoding
== wxFONTENCODING_DEFAULT
)
2934 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2935 m_encoding
= wxFONTENCODING_ISO8859_1
;
2938 m_encoding
= wxFONTENCODING_SYSTEM
;
2942 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2944 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2946 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2948 encoding
= wxFONTENCODING_SYSTEM
;
2953 m_encoding
= encoding
;
2956 wxCSConv::~wxCSConv()
2961 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2966 SetName(conv
.m_name
);
2967 m_encoding
= conv
.m_encoding
;
2970 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2974 SetName(conv
.m_name
);
2975 m_encoding
= conv
.m_encoding
;
2980 void wxCSConv::Clear()
2989 void wxCSConv::SetName(const char *charset
)
2993 m_name
= wxStrdup(charset
);
// Map from a font encoding to a charset name, used by wxCSConv::DoCreate()
// to remember the outcome of its search for a name accepted by iconv: the
// value is the name which resulted in a working wxMBConv_iconv or an empty
// string if none of the names known for this encoding did.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// the cache itself, it is filled lazily by wxCSConv::DoCreate()
static wxEncodingNameCache gs_nameCache;
3006 wxMBConv
*wxCSConv::DoCreate() const
3009 wxLogTrace(TRACE_STRCONV
,
3010 wxT("creating conversion for %s"),
3012 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
3013 #endif // wxUSE_FONTMAP
3015 // check for the special case of ASCII or ISO8859-1 charset: as we have
3016 // special knowledge of it anyhow, we don't need to create a special
3017 // conversion object
3018 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
3019 m_encoding
== wxFONTENCODING_DEFAULT
)
3021 // don't convert at all
3025 // we trust OS to do conversion better than we can so try external
3026 // conversion methods first
3028 // the full order is:
3029 // 1. OS conversion (iconv() under Unix or Win32 API)
3030 // 2. hard coded conversions for UTF
3031 // 3. wxEncodingConverter as fall back
3037 #endif // !wxUSE_FONTMAP
3040 wxFontEncoding
encoding(m_encoding
);
3045 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
3053 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3054 #endif // wxUSE_FONTMAP
3058 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3059 if ( it
!= gs_nameCache
.end() )
3061 if ( it
->second
.empty() )
3064 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
3071 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3072 // CS : in case this does not return valid names (eg for MacRoman)
3073 // encoding got a 'failure' entry in the cache all the same,
3074 // although it just has to be created using a different method, so
3075 // only store failed iconv creation attempts (or perhaps we
3076 // shouldn't do this at all?)
3077 if ( names
[0] != NULL
)
3079 for ( ; *names
; ++names
)
3081 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3082 // will need changes that will obsolete this
3083 wxString
name(*names
);
3084 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
3087 gs_nameCache
[encoding
] = *names
;
3094 gs_nameCache
[encoding
] = wxT(""); // cache the failure
3097 #endif // wxUSE_FONTMAP
3099 #endif // HAVE_ICONV
3101 #ifdef wxHAVE_WIN32_MB2WC
3104 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3105 : new wxMBConv_win32(m_encoding
);
3114 #endif // wxHAVE_WIN32_MB2WC
3118 // leave UTF16 and UTF32 to the built-ins of wx
3119 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3120 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3123 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3124 : new wxMBConv_cf(m_encoding
);
3126 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3135 #endif // __DARWIN__
3138 wxFontEncoding enc
= m_encoding
;
3140 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3142 // use "false" to suppress interactive dialogs -- we can be called from
3143 // anywhere and popping up a dialog from here is the last thing we want to
3145 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3147 #endif // wxUSE_FONTMAP
3151 case wxFONTENCODING_UTF7
:
3152 return new wxMBConvUTF7
;
3154 case wxFONTENCODING_UTF8
:
3155 return new wxMBConvUTF8
;
3157 case wxFONTENCODING_UTF16BE
:
3158 return new wxMBConvUTF16BE
;
3160 case wxFONTENCODING_UTF16LE
:
3161 return new wxMBConvUTF16LE
;
3163 case wxFONTENCODING_UTF32BE
:
3164 return new wxMBConvUTF32BE
;
3166 case wxFONTENCODING_UTF32LE
:
3167 return new wxMBConvUTF32LE
;
3170 // nothing to do but put here to suppress gcc warnings
3177 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3178 : new wxMBConv_wxwin(m_encoding
);
3185 wxLogTrace(TRACE_STRCONV
,
3186 wxT("encoding \"%s\" is not supported by this system"),
3187 (m_name
? wxString(m_name
)
3188 : wxFontMapperBase::GetEncodingName(m_encoding
)));
3189 #endif // wxUSE_FONTMAP
3194 void wxCSConv::CreateConvIfNeeded() const
3198 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3200 // if we don't have neither the name nor the encoding, use the default
3201 // encoding for this system
3202 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3205 self
->m_encoding
= wxLocale::GetSystemEncoding();
3207 // fallback to some reasonable default:
3208 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3209 #endif // wxUSE_INTL
3212 self
->m_convReal
= DoCreate();
3213 self
->m_deferred
= false;
3217 bool wxCSConv::IsOk() const
3219 CreateConvIfNeeded();
3221 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3222 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3223 return true; // always ok as we do it ourselves
3225 // m_convReal->IsOk() is called at its own creation, so we know it must
3226 // be ok if m_convReal is non-NULL
3227 return m_convReal
!= NULL
;
3230 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3231 const char *src
, size_t srcLen
) const
3233 CreateConvIfNeeded();
3236 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3239 if ( srcLen
== wxNO_LEN
)
3240 srcLen
= strlen(src
) + 1; // take trailing NUL too
3244 if ( dstLen
< srcLen
)
3245 return wxCONV_FAILED
;
3247 for ( size_t n
= 0; n
< srcLen
; n
++ )
3248 dst
[n
] = (unsigned char)(src
[n
]);
3254 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3255 const wchar_t *src
, size_t srcLen
) const
3257 CreateConvIfNeeded();
3260 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3263 if ( srcLen
== wxNO_LEN
)
3264 srcLen
= wxWcslen(src
) + 1;
3268 if ( dstLen
< srcLen
)
3269 return wxCONV_FAILED
;
3271 for ( size_t n
= 0; n
< srcLen
; n
++ )
3273 if ( src
[n
] > 0xFF )
3274 return wxCONV_FAILED
;
3276 dst
[n
] = (char)src
[n
];
3280 else // still need to check the input validity
3282 for ( size_t n
= 0; n
< srcLen
; n
++ )
3284 if ( src
[n
] > 0xFF )
3285 return wxCONV_FAILED
;
3292 size_t wxCSConv::GetMBNulLen() const
3294 CreateConvIfNeeded();
3298 return m_convReal
->GetMBNulLen();
3301 // otherwise, we are ISO-8859-1
3305 #if wxUSE_UNICODE_UTF8
3306 bool wxCSConv::IsUTF8() const
3308 CreateConvIfNeeded();
3312 return m_convReal
->IsUTF8();
3315 // otherwise, we are ISO-8859-1
3323 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3326 return wxWCharBuffer();
3328 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3330 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3332 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
3337 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3340 return wxCharBuffer();
3342 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3344 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3349 #endif // wxUSE_UNICODE
3351 // ----------------------------------------------------------------------------
3353 // ----------------------------------------------------------------------------
3355 // NB: The reason why we create converted objects in this convoluted way,
3356 // using a factory function instead of global variable, is that they
3357 // may be used at static initialization time (some of them are used by
3358 // wxString ctors and there may be a global wxString object). In other
3359 // words, possibly _before_ the converter global object would be
3366 #undef wxConvISO8859_1
// Define a global converter called "name" whose interface type is klass and
// whose implementation type is impl_klass:
//
//  - name##Ptr is the exported pointer, initially NULL,
//  - wxGet_##name##Ptr() is the exported accessor: it returns the address of
//    a function-local static impl_klass object which is constructed, using
//    ctor_args, the first time the accessor is called,
//  - gs_##name##instance exists only to call the accessor during the static
//    initialization of this file.
//
// The invocation must be followed by a semicolon.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as above for the common case when the interface and the
// implementation classes are the same.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3384 // disable warning "variable 'xxx' was declared but never referenced"
3385 #pragma warning(disable: 177)
3389 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3390 #elif 0 // defined(__WXOSX__)
3391 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_cf
, wxConvLibc
, (wxFONTENCODING_UTF8
));
3393 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3396 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3397 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3398 // provokes an error message about "not enough macro parameters"; and we
3399 // can't use "()" here as the name##Obj declaration would be parsed as a
3400 // function declaration then, so use a semicolon and live with an extra
3401 // empty statement (and hope that no compilers warns about this)
3402 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3403 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
3405 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3406 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
3408 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3409 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3412 // The xnu kernel always communicates file paths in decomposed UTF-8.
3413 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3414 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
3417 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3420 #else // !__DARWIN__
3421 wxGet_wxConvLibcPtr();
3422 #endif // __DARWIN__/!__DARWIN__
3424 #else // !wxUSE_WCHAR_T
3426 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3427 // stand-ins in absence of wchar_t
3428 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3433 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T