1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV wxT("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
74 static bool NotAllNULs(const char *p
, size_t n
)
76 while ( n
&& *p
++ == '\0' )
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(reinterpret_cast<const wxUint16
*>(*pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten
= 0;
176 // the number of NULs terminating this string
177 size_t nulLen
= 0; // not really needed, but just to avoid warnings
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
185 if ( srcLen
!= wxNO_LEN
)
187 // we need to know how to find the end of this string
188 nulLen
= GetMBNulLen();
189 if ( nulLen
== wxCONV_FAILED
)
190 return wxCONV_FAILED
;
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
197 char * const p
= bufTmp
.data();
198 memcpy(p
, src
, srcLen
);
199 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
205 srcEnd
= src
+ srcLen
;
207 else // quit after the first loop iteration
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
216 // all the complications come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd == NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
229 // try to convert the current chunk
230 size_t lenChunk
= MB2WC(NULL
, src
, 0);
231 if ( lenChunk
== wxCONV_FAILED
)
232 return wxCONV_FAILED
;
234 dstWritten
+= lenChunk
;
240 // nothing left in the input string, conversion succeeded
246 if ( dstWritten
> dstLen
)
247 return wxCONV_FAILED
;
249 // +1 is for trailing NUL
250 if ( MB2WC(dst
, src
, lenChunk
+ 1) == wxCONV_FAILED
)
251 return wxCONV_FAILED
;
260 // we convert just one chunk in this case as this is the entire
265 // advance the input pointer past the end of this chunk
266 while ( NotAllNULs(src
, nulLen
) )
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
275 src
+= nulLen
; // skipping over its terminator as well
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
279 // delimited by srcEnd
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
298 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
299 const wchar_t *src
, size_t srcLen
) const
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten
= 0;
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
309 // make a copy of the input string unless it is already properly
311 wxWCharBuffer bufTmp
;
312 if ( isNulTerminated
)
314 srcLen
= wxWcslen(src
) + 1;
316 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
318 // make a copy in order to properly NUL-terminate the string
319 bufTmp
= wxWCharBuffer(srcLen
);
320 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
324 const size_t lenNul
= GetMBNulLen();
325 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
327 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
329 // try to convert the current chunk
330 size_t lenChunk
= WC2MB(NULL
, src
, 0);
332 if ( lenChunk
== wxCONV_FAILED
)
333 return wxCONV_FAILED
;
335 dstWritten
+= lenChunk
;
336 if ( src
+lenChunk
< srcEnd
|| isNulTerminated
)
337 dstWritten
+= lenNul
;
341 if ( dstWritten
> dstLen
)
342 return wxCONV_FAILED
;
344 if ( WC2MB(dst
, src
, lenChunk
+ lenNul
) == wxCONV_FAILED
)
345 return wxCONV_FAILED
;
348 if ( src
+lenChunk
< srcEnd
|| isNulTerminated
)
356 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
358 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
359 if ( rc
!= wxCONV_FAILED
)
361 // ToWChar() returns the buffer length, i.e. including the trailing
362 // NUL, while this method doesn't take it into account
369 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
371 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
372 if ( rc
!= wxCONV_FAILED
)
380 wxMBConv::~wxMBConv()
382 // nothing to do here (necessary for Darwin linking probably)
385 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
389 // calculate the length of the buffer needed first
390 const size_t nLen
= ToWChar(NULL
, 0, psz
);
391 if ( nLen
!= wxCONV_FAILED
)
393 // now do the actual conversion
394 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
396 // +1 for the trailing NUL
397 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
402 return wxWCharBuffer();
405 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
409 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
410 if ( nLen
!= wxCONV_FAILED
)
412 wxCharBuffer
buf(nLen
- 1);
413 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
418 return wxCharBuffer();
422 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
424 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
425 if ( dstLen
!= wxCONV_FAILED
)
427 // notice that we allocate space for dstLen+1 wide characters here
428 // because we want the buffer to always be NUL-terminated, even if the
429 // input isn't (as otherwise the caller has no way to know its length)
430 wxWCharBuffer
wbuf(dstLen
);
431 wbuf
.data()[dstLen
] = L
'\0';
432 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
438 // we also need to handle NUL-terminated input strings
439 // specially: for them the output is the length of the string
440 // excluding the trailing NUL, however if we're asked to
441 // convert a specific number of characters we return the length
442 // of the resulting output even if it's NUL-terminated
443 if ( inLen
== wxNO_LEN
)
454 return wxWCharBuffer();
458 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
460 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
461 if ( dstLen
!= wxCONV_FAILED
)
463 const size_t nulLen
= GetMBNulLen();
465 // as above, ensure that the buffer is always NUL-terminated, even if
467 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
468 memset(buf
.data() + dstLen
, 0, nulLen
);
469 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
475 if ( inLen
== wxNO_LEN
)
477 // in this case both input and output are NUL-terminated
478 // and we're not supposed to count NUL
490 return wxCharBuffer();
493 const wxWCharBuffer
wxMBConv::cMB2WC(const wxScopedCharBuffer
& buf
) const
495 const size_t srcLen
= buf
.length();
498 const size_t dstLen
= ToWChar(NULL
, 0, buf
, srcLen
);
499 if ( dstLen
!= wxCONV_FAILED
)
501 wxWCharBuffer
wbuf(dstLen
);
502 wbuf
.data()[dstLen
] = L
'\0';
503 if ( ToWChar(wbuf
.data(), dstLen
, buf
, srcLen
) != wxCONV_FAILED
)
508 return wxWCharBuffer();
511 const wxCharBuffer
wxMBConv::cWC2MB(const wxScopedWCharBuffer
& wbuf
) const
513 const size_t srcLen
= wbuf
.length();
516 const size_t dstLen
= FromWChar(NULL
, 0, wbuf
, srcLen
);
517 if ( dstLen
!= wxCONV_FAILED
)
519 wxCharBuffer
buf(dstLen
);
520 buf
.data()[dstLen
] = '\0';
521 if ( FromWChar(buf
.data(), dstLen
, wbuf
, srcLen
) != wxCONV_FAILED
)
526 return wxCharBuffer();
529 // ----------------------------------------------------------------------------
531 // ----------------------------------------------------------------------------
533 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
535 return wxMB2WC(buf
, psz
, n
);
538 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
540 return wxWC2MB(buf
, psz
, n
);
543 // ----------------------------------------------------------------------------
544 // wxConvBrokenFileNames
545 // ----------------------------------------------------------------------------
549 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
551 if ( wxStricmp(charset
, wxT("UTF-8")) == 0 ||
552 wxStricmp(charset
, wxT("UTF8")) == 0 )
553 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
555 m_conv
= new wxCSConv(charset
);
560 // ----------------------------------------------------------------------------
562 // ----------------------------------------------------------------------------
564 // Implementation (C) 2004 Fredrik Roubert
566 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
569 // BASE64 decoding table
571 static const unsigned char utf7unb64
[] =
573 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
574 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
575 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
576 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
577 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
578 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
579 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
580 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
581 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
582 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
583 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
584 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
585 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
586 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
587 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
588 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
589 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
590 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
591 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
592 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
593 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
594 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
595 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
596 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
597 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
598 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
599 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
600 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
601 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
602 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
603 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
604 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
607 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
608 const char *src
, size_t srcLen
) const
610 DecoderState stateOrig
,
612 if ( srcLen
== wxNO_LEN
)
614 // convert the entire string, up to and including the trailing NUL
615 srcLen
= strlen(src
) + 1;
617 // when working on the entire strings we don't update nor use the shift
618 // state from the previous call
619 statePtr
= &stateOrig
;
621 else // when working with partial strings we do use the shift state
623 statePtr
= const_cast<DecoderState
*>(&m_stateDecoder
);
625 // also save the old state to be able to rollback to it on error
626 stateOrig
= m_stateDecoder
;
629 // but to simplify the code below we use this variable in both cases
630 DecoderState
& state
= *statePtr
;
633 // number of characters [which would have been] written to dst [if it were
637 const char * const srcEnd
= src
+ srcLen
;
639 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
641 const unsigned char cc
= *src
++;
643 if ( state
.IsShifted() )
645 const unsigned char dc
= utf7unb64
[cc
];
648 // end of encoded part, check that nothing was left: there can
649 // be up to 4 bits of 0 padding but nothing else (we also need
650 // to check isLSB as we count bits modulo 8 while a valid UTF-7
651 // encoded sequence must contain an integral number of UTF-16
653 if ( state
.isLSB
|| state
.bit
> 4 ||
654 (state
.accum
& ((1 << state
.bit
) - 1)) )
659 return wxCONV_FAILED
;
664 // re-parse this character normally below unless it's '-' which
665 // is consumed by the decoder
669 else // valid encoded character
671 // mini base64 decoder: each character is 6 bits
676 if ( state
.bit
>= 8 )
678 // got the full byte, consume it
680 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
684 // we've got the full word, output it
686 *dst
++ = (state
.msb
<< 8) | b
;
692 // just store it while we wait for LSB
700 if ( state
.IsDirect() )
702 // start of an encoded segment?
707 // just the encoded plus sign, don't switch to shifted mode
713 else if ( utf7unb64
[(unsigned)*src
] == 0xff )
715 // empty encoded chunks are not allowed
719 return wxCONV_FAILED
;
721 else // base-64 encoded chunk follows
728 // only printable 7 bit ASCII characters (with the exception of
729 // NUL, TAB, CR and LF) can be used directly
730 if ( cc
>= 0x7f || (cc
< ' ' &&
731 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
732 return wxCONV_FAILED
;
743 // as we didn't read any characters we should be called with the same
744 // data (followed by some more new data) again later so don't save our
748 return wxCONV_FAILED
;
755 // BASE64 encoding table
757 static const unsigned char utf7enb64
[] =
759 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
760 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
761 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
762 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
763 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
764 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
765 'w', 'x', 'y', 'z', '0', '1', '2', '3',
766 '4', '5', '6', '7', '8', '9', '+', '/'
770 // UTF-7 encoding table
772 // 0 - Set D (directly encoded characters)
773 // 1 - Set O (optional direct characters)
774 // 2 - whitespace characters (optional)
775 // 3 - special characters
777 static const unsigned char utf7encode
[128] =
779 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
780 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
781 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
782 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
783 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
784 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
785 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
786 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
789 static inline bool wxIsUTF7Direct(wchar_t wc
)
791 return wc
< 0x80 && utf7encode
[wc
] < 1;
794 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
795 const wchar_t *src
, size_t srcLen
) const
797 EncoderState stateOrig
,
799 if ( srcLen
== wxNO_LEN
)
801 // we don't apply the stored state when operating on entire strings at
803 statePtr
= &stateOrig
;
805 srcLen
= wxWcslen(src
) + 1;
807 else // do use the mode we left the output in previously
809 stateOrig
= m_stateEncoder
;
810 statePtr
= const_cast<EncoderState
*>(&m_stateEncoder
);
813 EncoderState
& state
= *statePtr
;
818 const wchar_t * const srcEnd
= src
+ srcLen
;
819 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
822 if ( wxIsUTF7Direct(cc
) )
824 if ( state
.IsShifted() )
826 // pad with zeros the last encoded block if necessary
830 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
845 else if ( cc
== '+' && state
.IsDirect() )
856 else if (((wxUint32
)cc
) > 0xffff)
858 // no surrogate pair generation (yet?)
859 return wxCONV_FAILED
;
864 if ( state
.IsDirect() )
873 // BASE64 encode string
876 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
879 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
881 for (state
.bit
+= 8; state
.bit
>= 6; )
885 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
890 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
898 // we need to restore the original encoder state if we were called just to
899 // calculate the amount of space needed as we will presumably be called
900 // again to really convert the data now
907 // ----------------------------------------------------------------------------
909 // ----------------------------------------------------------------------------
911 static const wxUint32 utf8_max
[]=
912 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
914 // boundaries of the private use area we use to (temporarily) remap invalid
915 // characters found in a UTF-8 encoded string
916 const wxUint32 wxUnicodePUA
= 0x100000;
917 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
919 // this table gives the length of the UTF-8 encoding from its first character:
920 const unsigned char tableUtf8Lengths
[256] = {
921 // single-byte sequences (ASCII):
922 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
923 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
924 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
925 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
926 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
927 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
928 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
929 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
931 // these are invalid:
932 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
933 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
934 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
935 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
938 // two-byte sequences:
939 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
940 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
942 // three-byte sequences:
943 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
945 // four-byte sequences:
946 4, 4, 4, 4, 4, // F0..F4
948 // these are invalid again (5- or 6-byte
949 // sequences and sequences for code points
950 // above U+10FFFF, as restricted by RFC 3629):
951 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
955 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
956 const char *src
, size_t srcLen
) const
958 wchar_t *out
= dstLen
? dst
: NULL
;
961 if ( srcLen
== wxNO_LEN
)
962 srcLen
= strlen(src
) + 1;
964 for ( const char *p
= src
; ; p
++ )
966 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
968 // all done successfully, just add the trailing NUL if we are not
969 // using explicit length
970 if ( srcLen
== wxNO_LEN
)
986 if ( out
&& !dstLen
-- )
990 unsigned char c
= *p
;
994 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
997 if ( srcLen
!= wxNO_LEN
)
1004 unsigned len
= tableUtf8Lengths
[c
];
1008 if ( srcLen
< len
) // the test works for wxNO_LEN too
1011 if ( srcLen
!= wxNO_LEN
)
1014 // Char. number range | UTF-8 octet sequence
1015 // (hexadecimal) | (binary)
1016 // ----------------------+----------------------------------------
1017 // 0000 0000 - 0000 007F | 0xxxxxxx
1018 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1019 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1020 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1022 // Code point value is stored in bits marked with 'x',
1023 // lowest-order bit of the value on the right side in the diagram
1024 // above. (from RFC 3629)
1026 // mask to extract lead byte's value ('x' bits above), by sequence
1028 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1030 // mask and value of lead byte's most significant bits, by length:
1031 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1032 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1034 len
--; // it's more convenient to work with 0-based length here
1036 // extract the lead byte's value bits:
1037 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
1040 code
= c
& leadValueMask
[len
];
1042 // all remaining bytes, if any, are handled in the same way
1043 // regardless of sequence's length:
1044 for ( ; len
; --len
)
1047 if ( (c
& 0xC0) != 0x80 )
1048 return wxCONV_FAILED
;
1056 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1057 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
1066 #endif // WC_UTF16/!WC_UTF16
1074 return wxCONV_FAILED
;
1078 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
1079 const wchar_t *src
, size_t srcLen
) const
1081 char *out
= dstLen
? dst
: NULL
;
1084 for ( const wchar_t *wp
= src
; ; wp
++ )
1086 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
) )
1088 // all done successfully, just add the trailing NUL if we are not
1089 // using explicit length
1090 if ( srcLen
== wxNO_LEN
)
1106 if ( srcLen
!= wxNO_LEN
)
1111 // cast is ok for WC_UTF16
1112 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1114 // skip the next char too as we decoded a surrogate
1117 #else // wchar_t is UTF-32
1118 code
= *wp
& 0x7fffffff;
1130 out
[0] = (char)code
;
1133 else if ( code
<= 0x07FF )
1141 // NB: this line takes 6 least significant bits, encodes them as
1142 // 10xxxxxx and discards them so that the next byte can be encoded:
1143 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1144 out
[0] = 0xC0 | code
;
1147 else if ( code
< 0xFFFF )
1155 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1156 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1157 out
[0] = 0xE0 | code
;
1160 else if ( code
<= 0x10FFFF )
1168 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1169 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1170 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1171 out
[0] = 0xF0 | code
;
1176 wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1189 // we only get here if an error occurs during decoding
1190 return wxCONV_FAILED
;
1193 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1194 const char *psz
, size_t srcLen
) const
1196 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1197 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1201 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1203 const char *opsz
= psz
;
1204 bool invalid
= false;
1205 unsigned char cc
= *psz
++, fc
= cc
;
1207 for (cnt
= 0; fc
& 0x80; cnt
++)
1217 // escape the escape character for octal escapes
1218 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1219 && cc
== '\\' && (!buf
|| len
< n
))
1231 // invalid UTF-8 sequence
1236 unsigned ocnt
= cnt
- 1;
1237 wxUint32 res
= cc
& (0x3f >> cnt
);
1241 if ((cc
& 0xC0) != 0x80)
1243 // invalid UTF-8 sequence
1249 res
= (res
<< 6) | (cc
& 0x3f);
1252 if (invalid
|| res
<= utf8_max
[ocnt
])
1254 // illegal UTF-8 encoding
1257 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1258 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1260 // if one of our PUA characters turns up externally
1261 // it must also be treated as an illegal sequence
1262 // (a bit like you have to escape an escape character)
1268 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1269 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1270 if (pa
== wxCONV_FAILED
)
1282 *buf
++ = (wchar_t)res
;
1284 #endif // WC_UTF16/!WC_UTF16
1290 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1292 while (opsz
< psz
&& (!buf
|| len
< n
))
1295 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1296 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1297 wxASSERT(pa
!= wxCONV_FAILED
);
1304 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1310 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1312 while (opsz
< psz
&& (!buf
|| len
< n
))
1314 if ( buf
&& len
+ 3 < n
)
1316 unsigned char on
= *opsz
;
1318 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1319 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1320 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1327 else // MAP_INVALID_UTF8_NOT
1329 return wxCONV_FAILED
;
1335 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1341 static inline bool isoctal(wchar_t wch
)
1343 return L
'0' <= wch
&& wch
<= L
'7';
1346 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1347 const wchar_t *psz
, size_t srcLen
) const
1349 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1350 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1354 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1359 // cast is ok for WC_UTF16
1360 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1361 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1363 cc
= (*psz
++) & 0x7fffffff;
1366 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1367 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1370 *buf
++ = (char)(cc
- wxUnicodePUA
);
1373 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1374 && cc
== L
'\\' && psz
[0] == L
'\\' )
1381 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1383 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1387 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1388 (psz
[1] - L
'0') * 010 +
1398 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1414 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1416 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1422 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1428 // ============================================================================
1430 // ============================================================================
1432 #ifdef WORDS_BIGENDIAN
1433 #define wxMBConvUTF16straight wxMBConvUTF16BE
1434 #define wxMBConvUTF16swap wxMBConvUTF16LE
1436 #define wxMBConvUTF16swap wxMBConvUTF16BE
1437 #define wxMBConvUTF16straight wxMBConvUTF16LE
1441 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1443 if ( srcLen
== wxNO_LEN
)
1445 // count the number of bytes in input, including the trailing NULs
1446 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1447 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1450 srcLen
*= BYTES_PER_CHAR
;
1452 else // we already have the length
1454 // we can only convert an entire number of UTF-16 characters
1455 if ( srcLen
% BYTES_PER_CHAR
)
1456 return wxCONV_FAILED
;
1462 // case when in-memory representation is UTF-16 too
1465 // ----------------------------------------------------------------------------
1466 // conversions without endianness change
1467 // ----------------------------------------------------------------------------
1470 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1471 const char *src
, size_t srcLen
) const
1473 // set up the scene for using memcpy() (which is presumably more efficient
1474 // than copying the bytes one by one)
1475 srcLen
= GetLength(src
, srcLen
);
1476 if ( srcLen
== wxNO_LEN
)
1477 return wxCONV_FAILED
;
1479 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1482 if ( dstLen
< inLen
)
1483 return wxCONV_FAILED
;
1485 memcpy(dst
, src
, srcLen
);
1492 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1493 const wchar_t *src
, size_t srcLen
) const
1495 if ( srcLen
== wxNO_LEN
)
1496 srcLen
= wxWcslen(src
) + 1;
1498 srcLen
*= BYTES_PER_CHAR
;
1502 if ( dstLen
< srcLen
)
1503 return wxCONV_FAILED
;
1505 memcpy(dst
, src
, srcLen
);
1511 // ----------------------------------------------------------------------------
1512 // endian-reversing conversions
1513 // ----------------------------------------------------------------------------
1516 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1517 const char *src
, size_t srcLen
) const
1519 srcLen
= GetLength(src
, srcLen
);
1520 if ( srcLen
== wxNO_LEN
)
1521 return wxCONV_FAILED
;
1523 srcLen
/= BYTES_PER_CHAR
;
1527 if ( dstLen
< srcLen
)
1528 return wxCONV_FAILED
;
1530 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1531 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1533 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1541 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1542 const wchar_t *src
, size_t srcLen
) const
1544 if ( srcLen
== wxNO_LEN
)
1545 srcLen
= wxWcslen(src
) + 1;
1547 srcLen
*= BYTES_PER_CHAR
;
1551 if ( dstLen
< srcLen
)
1552 return wxCONV_FAILED
;
1554 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1555 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1557 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1564 #else // !WC_UTF16: wchar_t is UTF-32
1566 // ----------------------------------------------------------------------------
1567 // conversions without endianness change
1568 // ----------------------------------------------------------------------------
1571 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1572 const char *src
, size_t srcLen
) const
1574 srcLen
= GetLength(src
, srcLen
);
1575 if ( srcLen
== wxNO_LEN
)
1576 return wxCONV_FAILED
;
1578 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1581 // optimization: return maximal space which could be needed for this
1582 // string even if the real size could be smaller if the buffer contains
1588 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1589 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1591 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1593 return wxCONV_FAILED
;
1595 if ( ++outLen
> dstLen
)
1596 return wxCONV_FAILED
;
1606 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1607 const wchar_t *src
, size_t srcLen
) const
1609 if ( srcLen
== wxNO_LEN
)
1610 srcLen
= wxWcslen(src
) + 1;
1613 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1614 for ( size_t n
= 0; n
< srcLen
; n
++ )
1617 const size_t numChars
= encode_utf16(*src
++, cc
);
1618 if ( numChars
== wxCONV_FAILED
)
1619 return wxCONV_FAILED
;
1621 outLen
+= numChars
* BYTES_PER_CHAR
;
1624 if ( outLen
> dstLen
)
1625 return wxCONV_FAILED
;
1628 if ( numChars
== 2 )
1630 // second character of a surrogate
1639 // ----------------------------------------------------------------------------
1640 // endian-reversing conversions
1641 // ----------------------------------------------------------------------------
1644 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1645 const char *src
, size_t srcLen
) const
1647 srcLen
= GetLength(src
, srcLen
);
1648 if ( srcLen
== wxNO_LEN
)
1649 return wxCONV_FAILED
;
1651 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1654 // optimization: return maximal space which could be needed for this
1655 // string even if the real size could be smaller if the buffer contains
1661 const wxUint16
*inBuff
= reinterpret_cast<const wxUint16
*>(src
);
1662 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1667 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1669 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1671 const size_t numChars
= decode_utf16(tmp
, ch
);
1672 if ( numChars
== wxCONV_FAILED
)
1673 return wxCONV_FAILED
;
1675 if ( numChars
== 2 )
1678 if ( ++outLen
> dstLen
)
1679 return wxCONV_FAILED
;
1689 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1690 const wchar_t *src
, size_t srcLen
) const
1692 if ( srcLen
== wxNO_LEN
)
1693 srcLen
= wxWcslen(src
) + 1;
1696 wxUint16
*outBuff
= reinterpret_cast<wxUint16
*>(dst
);
1697 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1700 const size_t numChars
= encode_utf16(*src
, cc
);
1701 if ( numChars
== wxCONV_FAILED
)
1702 return wxCONV_FAILED
;
1704 outLen
+= numChars
* BYTES_PER_CHAR
;
1707 if ( outLen
> dstLen
)
1708 return wxCONV_FAILED
;
1710 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1711 if ( numChars
== 2 )
1713 // second character of a surrogate
1714 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1722 #endif // WC_UTF16/!WC_UTF16
1725 // ============================================================================
1727 // ============================================================================
1729 #ifdef WORDS_BIGENDIAN
1730 #define wxMBConvUTF32straight wxMBConvUTF32BE
1731 #define wxMBConvUTF32swap wxMBConvUTF32LE
1733 #define wxMBConvUTF32swap wxMBConvUTF32BE
1734 #define wxMBConvUTF32straight wxMBConvUTF32LE
1738 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1739 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1742 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1744 if ( srcLen
== wxNO_LEN
)
1746 // count the number of bytes in input, including the trailing NULs
1747 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1748 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1751 srcLen
*= BYTES_PER_CHAR
;
1753 else // we already have the length
1755 // we can only convert an entire number of UTF-32 characters
1756 if ( srcLen
% BYTES_PER_CHAR
)
1757 return wxCONV_FAILED
;
1763 // case when in-memory representation is UTF-16
1766 // ----------------------------------------------------------------------------
1767 // conversions without endianness change
1768 // ----------------------------------------------------------------------------
1771 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1772 const char *src
, size_t srcLen
) const
1774 srcLen
= GetLength(src
, srcLen
);
1775 if ( srcLen
== wxNO_LEN
)
1776 return wxCONV_FAILED
;
1778 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1779 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1781 for ( size_t n
= 0; n
< inLen
; n
++ )
1784 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1785 if ( numChars
== wxCONV_FAILED
)
1786 return wxCONV_FAILED
;
1791 if ( outLen
> dstLen
)
1792 return wxCONV_FAILED
;
1795 if ( numChars
== 2 )
1797 // second character of a surrogate
1807 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1808 const wchar_t *src
, size_t srcLen
) const
1810 if ( srcLen
== wxNO_LEN
)
1811 srcLen
= wxWcslen(src
) + 1;
1815 // optimization: return maximal space which could be needed for this
1816 // string instead of the exact amount which could be less if there are
1817 // any surrogates in the input
1819 // we consider that surrogates are rare enough to make it worthwhile to
1820 // avoid running the loop below at the cost of slightly extra memory
1822 return srcLen
* BYTES_PER_CHAR
;
1825 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
1827 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1829 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1831 return wxCONV_FAILED
;
1833 outLen
+= BYTES_PER_CHAR
;
1835 if ( outLen
> dstLen
)
1836 return wxCONV_FAILED
;
1844 // ----------------------------------------------------------------------------
1845 // endian-reversing conversions
1846 // ----------------------------------------------------------------------------
1849 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1850 const char *src
, size_t srcLen
) const
1852 srcLen
= GetLength(src
, srcLen
);
1853 if ( srcLen
== wxNO_LEN
)
1854 return wxCONV_FAILED
;
1856 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1857 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1859 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1862 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1863 if ( numChars
== wxCONV_FAILED
)
1864 return wxCONV_FAILED
;
1869 if ( outLen
> dstLen
)
1870 return wxCONV_FAILED
;
1873 if ( numChars
== 2 )
1875 // second character of a surrogate
1885 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1886 const wchar_t *src
, size_t srcLen
) const
1888 if ( srcLen
== wxNO_LEN
)
1889 srcLen
= wxWcslen(src
) + 1;
1893 // optimization: return maximal space which could be needed for this
1894 // string instead of the exact amount which could be less if there are
1895 // any surrogates in the input
1897 // we consider that surrogates are rare enough to make it worthwhile to
1898 // avoid running the loop below at the cost of slightly extra memory
1900 return srcLen
*BYTES_PER_CHAR
;
1903 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
1905 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1907 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1909 return wxCONV_FAILED
;
1911 outLen
+= BYTES_PER_CHAR
;
1913 if ( outLen
> dstLen
)
1914 return wxCONV_FAILED
;
1916 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1922 #else // !WC_UTF16: wchar_t is UTF-32
1924 // ----------------------------------------------------------------------------
1925 // conversions without endianness change
1926 // ----------------------------------------------------------------------------
1929 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1930 const char *src
, size_t srcLen
) const
1932 // use memcpy() as it should be much faster than hand-written loop
1933 srcLen
= GetLength(src
, srcLen
);
1934 if ( srcLen
== wxNO_LEN
)
1935 return wxCONV_FAILED
;
1937 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1940 if ( dstLen
< inLen
)
1941 return wxCONV_FAILED
;
1943 memcpy(dst
, src
, srcLen
);
1950 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1951 const wchar_t *src
, size_t srcLen
) const
1953 if ( srcLen
== wxNO_LEN
)
1954 srcLen
= wxWcslen(src
) + 1;
1956 srcLen
*= BYTES_PER_CHAR
;
1960 if ( dstLen
< srcLen
)
1961 return wxCONV_FAILED
;
1963 memcpy(dst
, src
, srcLen
);
1969 // ----------------------------------------------------------------------------
1970 // endian-reversing conversions
1971 // ----------------------------------------------------------------------------
1974 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1975 const char *src
, size_t srcLen
) const
1977 srcLen
= GetLength(src
, srcLen
);
1978 if ( srcLen
== wxNO_LEN
)
1979 return wxCONV_FAILED
;
1981 srcLen
/= BYTES_PER_CHAR
;
1985 if ( dstLen
< srcLen
)
1986 return wxCONV_FAILED
;
1988 const wxUint32
*inBuff
= reinterpret_cast<const wxUint32
*>(src
);
1989 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1991 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1999 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
2000 const wchar_t *src
, size_t srcLen
) const
2002 if ( srcLen
== wxNO_LEN
)
2003 srcLen
= wxWcslen(src
) + 1;
2005 srcLen
*= BYTES_PER_CHAR
;
2009 if ( dstLen
< srcLen
)
2010 return wxCONV_FAILED
;
2012 wxUint32
*outBuff
= reinterpret_cast<wxUint32
*>(dst
);
2013 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
2015 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
2022 #endif // WC_UTF16/!WC_UTF16
2025 // ============================================================================
2026 // The classes doing conversion using the iconv_xxx() functions
2027 // ============================================================================
2031 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2032 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2033 // (unless there's yet another bug in glibc) the only case when iconv()
2034 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2035 // left in the input buffer -- when _real_ error occurs,
2036 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2038 // [This bug does not appear in glibc 2.2.]
2039 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2040 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2041 (errno != E2BIG || bufLeft != 0))
2043 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2046 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2048 #define ICONV_T_INVALID ((iconv_t)-1)
2050 #if SIZEOF_WCHAR_T == 4
2051 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2052 #define WC_ENC wxFONTENCODING_UTF32
2053 #elif SIZEOF_WCHAR_T == 2
2054 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2055 #define WC_ENC wxFONTENCODING_UTF16
2056 #else // sizeof(wchar_t) != 2 nor 4
2057 // does this ever happen?
2058 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2061 // ----------------------------------------------------------------------------
2062 // wxMBConv_iconv: encapsulates an iconv character set
2063 // ----------------------------------------------------------------------------
2065 class wxMBConv_iconv
: public wxMBConv
2068 wxMBConv_iconv(const char *name
);
2069 virtual ~wxMBConv_iconv();
2071 // implement base class virtual methods
2072 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
2073 const char *src
, size_t srcLen
= wxNO_LEN
) const;
2074 virtual size_t FromWChar(char *dst
, size_t dstLen
,
2075 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
2076 virtual size_t GetMBNulLen() const;
2078 #if wxUSE_UNICODE_UTF8
2079 virtual bool IsUTF8() const;
2082 virtual wxMBConv
*Clone() const
2084 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
2085 p
->m_minMBCharWidth
= m_minMBCharWidth
;
2090 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
2093 // the iconv handlers used to translate from multibyte
2094 // to wide char and in the other direction
2099 // guards access to m2w and w2m objects
2100 wxMutex m_iconvMutex
;
2104 // the name (for iconv_open()) of a wide char charset -- if none is
2105 // available on this machine, it will remain empty
2106 static wxString ms_wcCharsetName
;
2108 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2109 // different endian-ness than the native one
2110 static bool ms_wcNeedsSwap
;
2113 // name of the encoding handled by this conversion
2116 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2118 size_t m_minMBCharWidth
;
2121 // make the constructor available for unit testing
2122 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2124 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2125 if ( !result
->IsOk() )
2134 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2135 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
2137 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
2140 m_minMBCharWidth
= 0;
2142 // check for charset that represents wchar_t:
2143 if ( ms_wcCharsetName
.empty() )
2145 wxLogTrace(TRACE_STRCONV
, wxT("Looking for wide char codeset:"));
2148 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2149 #else // !wxUSE_FONTMAP
2150 static const wxChar
*names_static
[] =
2152 #if SIZEOF_WCHAR_T == 4
// NB: this has to be a comparison ("=="): a plain "=" is not a valid
//     preprocessor expression and breaks builds that evaluate this branch
2154 #elif SIZEOF_WCHAR_T == 2
2159 const wxChar
**names
= names_static
;
2160 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2162 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2164 const wxString
nameCS(*names
);
2166 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2167 wxString
nameXE(nameCS
);
2169 #ifdef WORDS_BIGENDIAN
2170 nameXE
+= wxT("BE");
2171 #else // little endian
2172 nameXE
+= wxT("LE");
2175 wxLogTrace(TRACE_STRCONV
, wxT(" trying charset \"%s\""),
2178 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2179 if ( m2w
== ICONV_T_INVALID
)
2181 // try charset w/o bytesex info (e.g. "UCS4")
2182 wxLogTrace(TRACE_STRCONV
, wxT(" trying charset \"%s\""),
2184 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2186 // and check for bytesex ourselves:
2187 if ( m2w
!= ICONV_T_INVALID
)
2189 char buf
[2], *bufPtr
;
2198 outsz
= SIZEOF_WCHAR_T
* 2;
2199 char* wbufPtr
= (char*)wbuf
;
2203 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2206 if (ICONV_FAILED(res
, insz
))
2208 wxLogLastError(wxT("iconv"));
2209 wxLogError(_("Conversion to charset '%s' doesn't work."),
2212 else // ok, can convert to this encoding, remember it
2214 ms_wcCharsetName
= nameCS
;
2215 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2219 else // use charset not requiring byte swapping
2221 ms_wcCharsetName
= nameXE
;
2225 wxLogTrace(TRACE_STRCONV
,
2226 wxT("iconv wchar_t charset is \"%s\"%s"),
2227 ms_wcCharsetName
.empty() ? wxString("<none>")
2229 ms_wcNeedsSwap
? wxT(" (needs swap)")
2232 else // we already have ms_wcCharsetName
2234 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2237 if ( ms_wcCharsetName
.empty() )
2239 w2m
= ICONV_T_INVALID
;
2243 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2244 if ( w2m
== ICONV_T_INVALID
)
2246 wxLogTrace(TRACE_STRCONV
,
2247 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2248 ms_wcCharsetName
.c_str(), name
);
2253 wxMBConv_iconv::~wxMBConv_iconv()
2255 if ( m2w
!= ICONV_T_INVALID
)
2257 if ( w2m
!= ICONV_T_INVALID
)
2262 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2263 const char *src
, size_t srcLen
) const
2265 if ( srcLen
== wxNO_LEN
)
2267 // find the string length: notice that this must be done differently for
2268 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2270 const size_t nulLen
= GetMBNulLen();
2274 return wxCONV_FAILED
;
2277 srcLen
= strlen(src
); // arguably more optimized than our version
2282 // for UTF-16/32 not only do we need to have 2/4 consecutive NULs
2283 // but they also have to start at a character boundary and not
2284 // span two adjacent characters
2286 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2292 // when we're determining the length of the string ourselves we count
2293 // the terminating NUL(s) as part of it and always NUL-terminate the
2298 // we express length in the number of (wide) characters but iconv always
2299 // counts buffer sizes in bytes
2300 dstLen
*= SIZEOF_WCHAR_T
;
2303 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2304 // Unfortunately there are a couple of global wxCSConv objects such as
2305 // wxConvLocal that are used all over wx code, so we have to make sure
2306 // the handle is used by at most one thread at a time. Otherwise
2307 // only a few wx classes would be safe to use from non-main threads
2308 // as MB<->WC conversion would fail "randomly".
2309 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2310 #endif // wxUSE_THREADS
2313 const char *pszPtr
= src
;
2317 char* bufPtr
= (char*)dst
;
2319 // have destination buffer, convert there
2320 size_t dstLenOrig
= dstLen
;
2322 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2325 // convert the number of bytes converted as returned by iconv to the
2326 // number of (wide) characters converted that we need
2327 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2331 // convert to native endianness
2332 for ( unsigned i
= 0; i
< res
; i
++ )
2333 dst
[i
] = WC_BSWAP(dst
[i
]);
2336 else // no destination buffer
2338 // convert using temp buffer to calculate the size of the buffer needed
2344 char* bufPtr
= (char*)tbuf
;
2345 dstLen
= 8 * SIZEOF_WCHAR_T
;
2348 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2351 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
2353 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2356 if (ICONV_FAILED(cres
, srcLen
))
2358 //VS: it is ok if iconv fails, hence trace only
2359 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2360 return wxCONV_FAILED
;
2366 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2367 const wchar_t *src
, size_t srcLen
) const
2370 // NB: the reason for this lock is explained in ToWChar()
2371 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2374 if ( srcLen
== wxNO_LEN
)
2375 srcLen
= wxWcslen(src
) + 1;
2377 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2378 size_t outbuflen
= dstLen
;
2381 wchar_t *tmpbuf
= 0;
2385 // need to copy to temp buffer to switch endianness
2386 // (doing WC_BSWAP twice on the original buffer won't work, as it
2387 // could be in read-only memory, or be accessed in some other thread)
2388 tmpbuf
= (wchar_t *)malloc(inbuflen
);
2389 for ( size_t i
= 0; i
< srcLen
; i
++ )
2390 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2395 char* inbuf
= (char*)src
;
2398 // have destination buffer, convert there
2399 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2401 res
= dstLen
- outbuflen
;
2403 else // no destination buffer
2405 // convert using temp buffer to calculate the size of the buffer needed
2411 outbuflen
= WXSIZEOF(tbuf
);
2413 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2415 res
+= WXSIZEOF(tbuf
) - outbuflen
;
2417 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2425 if (ICONV_FAILED(cres
, inbuflen
))
2427 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2428 return wxCONV_FAILED
;
2434 size_t wxMBConv_iconv::GetMBNulLen() const
2436 if ( m_minMBCharWidth
== 0 )
2438 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2441 // NB: the reason for this lock is explained in ToWChar()
2442 wxMutexLocker
lock(self
->m_iconvMutex
);
2445 const wchar_t *wnul
= L
"";
2446 char buf
[8]; // should be enough for NUL in any encoding
2447 size_t inLen
= sizeof(wchar_t),
2448 outLen
= WXSIZEOF(buf
);
2449 char *inBuff
= (char *)wnul
;
2450 char *outBuff
= buf
;
2451 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2453 self
->m_minMBCharWidth
= (size_t)-1;
2457 self
->m_minMBCharWidth
= outBuff
- buf
;
2461 return m_minMBCharWidth
;
2464 #if wxUSE_UNICODE_UTF8
2465 bool wxMBConv_iconv::IsUTF8() const
2467 return wxStricmp(m_name
, "UTF-8") == 0 ||
2468 wxStricmp(m_name
, "UTF8") == 0;
2472 #endif // HAVE_ICONV
2475 // ============================================================================
2476 // Win32 conversion classes
2477 // ============================================================================
2479 #ifdef wxHAVE_WIN32_MB2WC
2483 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2484 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2487 class wxMBConv_win32
: public wxMBConv
2492 m_CodePage
= CP_ACP
;
2493 m_minMBCharWidth
= 0;
2496 wxMBConv_win32(const wxMBConv_win32
& conv
)
2499 m_CodePage
= conv
.m_CodePage
;
2500 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2504 wxMBConv_win32(const char* name
)
2506 m_CodePage
= wxCharsetToCodepage(name
);
2507 m_minMBCharWidth
= 0;
2510 wxMBConv_win32(wxFontEncoding encoding
)
2512 m_CodePage
= wxEncodingToCodepage(encoding
);
2513 m_minMBCharWidth
= 0;
2515 #endif // wxUSE_FONTMAP
2517 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2519 // note that we have to use the MB_ERR_INVALID_CHARS flag as without it
2520 // the behaviour is not compatible with the Unix version (using iconv)
2521 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2522 // wouldn't work if reading an incomplete MB char didn't result in an
2525 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2526 // Win XP or newer and it is not supported for UTF-[78] so we always
2527 // use our own conversions in this case. See
2528 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2529 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2530 if ( m_CodePage
== CP_UTF8
)
2532 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2535 if ( m_CodePage
== CP_UTF7
)
2537 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2541 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2542 IsAtLeastWin2kSP4() )
2544 flags
= MB_ERR_INVALID_CHARS
;
2547 const size_t len
= ::MultiByteToWideChar
2549 m_CodePage
, // code page
2550 flags
, // flags: fall on error
2551 psz
, // input string
2552 -1, // its length (NUL-terminated)
2553 buf
, // output string
2554 buf
? n
: 0 // size of output buffer
2558 // function totally failed
2559 return wxCONV_FAILED
;
2562 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2563 // check if we succeeded, by doing a double trip:
2564 if ( !flags
&& buf
)
2566 const size_t mbLen
= strlen(psz
);
2567 wxCharBuffer
mbBuf(mbLen
);
2568 if ( ::WideCharToMultiByte
2575 mbLen
+ 1, // size in bytes, not length
2579 strcmp(mbBuf
, psz
) != 0 )
2581 // we didn't obtain the same thing we started from, hence
2582 // the conversion was lossy and we consider that it failed
2583 return wxCONV_FAILED
;
2587 // note that it returns count of written chars for buf != NULL and size
2588 // of the needed buffer for buf == NULL so in either case the length of
2589 // the string (which never includes the terminating NUL) is one less
2593 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2596 we have a problem here: by default, WideCharToMultiByte() may
2597 replace characters unrepresentable in the target code page with bad
2598 quality approximations such as turning "1/2" symbol (U+00BD) into
2599 "1" for the code pages which don't have it and we, obviously, want
2600 to avoid this at any price
2602 the trouble is that this function does it _silently_, i.e. it won't
2603 even tell us whether it did or not... Win98/2000 and higher provide
2604 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2605 we have to resort to a round trip, i.e. check that converting back
2606 results in the same string -- this is, of course, expensive but
2607 otherwise we simply can't be sure to not garble the data.
2610 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2611 // it doesn't work with CJK encodings (which we test for rather roughly
2612 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2614 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2617 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2619 // it's our lucky day
2620 flags
= WC_NO_BEST_FIT_CHARS
;
2621 pUsedDef
= &usedDef
;
2623 else // old system or unsupported encoding
2629 const size_t len
= ::WideCharToMultiByte
2631 m_CodePage
, // code page
2632 flags
, // either none or no best fit
2633 pwz
, // input string
2634 -1, // it is (wide) NUL-terminated
2635 buf
, // output buffer
2636 buf
? n
: 0, // and its size
2637 NULL
, // default "replacement" char
2638 pUsedDef
// [out] was it used?
2643 // function totally failed
2644 return wxCONV_FAILED
;
2647 // we did something, check if we really succeeded
2650 // check if the conversion failed, i.e. if any replacements
2653 return wxCONV_FAILED
;
2655 else // we must resort to double tripping...
2657 // first we need to ensure that we really have the MB data: this is
2658 // not the case if we're called with NULL buffer, in which case we
2659 // need to do the conversion yet again
2660 wxCharBuffer bufDef
;
2663 bufDef
= wxCharBuffer(len
);
2664 buf
= bufDef
.data();
2665 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2666 buf
, len
, NULL
, NULL
) )
2667 return wxCONV_FAILED
;
2672 wxWCharBuffer
wcBuf(n
);
2673 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2674 wcscmp(wcBuf
, pwz
) != 0 )
2676 // we didn't obtain the same thing we started from, hence
2677 // the conversion was lossy and we consider that it failed
2678 return wxCONV_FAILED
;
2682 // see the comment above for the reason of "len - 1"
// Returns the value cached in m_minMBCharWidth. While the cache still holds 0
// it is computed first, by asking ::WideCharToMultiByte() to convert a lone
// wide NUL (an empty wide string with an explicit length of 1) for m_CodePage.
// NOTE(review): the conditions selecting between the three assignments below
// are not part of this listing; (size_t)-1 looks like the "could not be
// determined" marker -- confirm against the complete source.
2686 virtual size_t GetMBNulLen() const
2688 if ( m_minMBCharWidth
== 0 )
2690 int len
= ::WideCharToMultiByte
2692 m_CodePage
, // code page
2694 L
"", // input string
2695 1, // translate just the NUL
2696 NULL
, // output buffer
2698 NULL
, // no replacement char
2699 NULL
// [out] don't care if it was used
// this method is const, so the cache is written through a non-const alias
2702 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2706 wxLogDebug(wxT("Unexpected NUL length %d"), len
);
2707 self
->m_minMBCharWidth
= (size_t)-1;
2711 self
->m_minMBCharWidth
= (size_t)-1;
// remember the length reported by WideCharToMultiByte()
2717 self
->m_minMBCharWidth
= len
;
2722 return m_minMBCharWidth
;
// Returns a new heap-allocated duplicate of this converter, built with the
// copy constructor.
2725 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
// Reports whether the converter is usable: true unless m_CodePage is -1.
2727 bool IsOk() const { return m_CodePage
!= -1; }
// Tells whether the running Windows version is recent enough, according to
// the version tests below: 9x family with version >= 4.10, or NT family with
// major version >= 5.
// The answer is computed on the first call only and kept in the function-local
// static s_isWin98Or2k, where -1 means "not determined yet".
2730 static bool CanUseNoBestFit()
2732 static int s_isWin98Or2k
= -1;
2734 if ( s_isWin98Or2k
== -1 )
2737 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2739 case wxOS_WINDOWS_9X
:
2740 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2743 case wxOS_WINDOWS_NT
:
2744 s_isWin98Or2k
= verMaj
>= 5;
2748 // unknown: be conservative by default
// every path through the switch is expected to have replaced the -1 marker
2753 wxASSERT_MSG( s_isWin98Or2k
!= -1, wxT("should be set above") );
2756 return s_isWin98Or2k
== 1;
// Tells whether the OS version reported by GetVersionEx() is Windows 2000
// with service pack 4 or anything newer (major > 5, or 5.x with x > 0, or
// 5.0 with wServicePackMajor >= 4).
// The result is computed on the first call and cached in the function-local
// static s_isAtLeastWin2kSP4, where -1 means "not determined yet".
2759 static bool IsAtLeastWin2kSP4()
2764 static int s_isAtLeastWin2kSP4
= -1;
2766 if ( s_isAtLeastWin2kSP4
== -1 )
2768 OSVERSIONINFOEX ver
;
// zero the structure first: the return value of GetVersionEx() is not
// examined below, so after a failed call the comparisons see all zeros
2770 memset(&ver
, 0, sizeof(ver
));
2771 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2772 GetVersionEx((OSVERSIONINFO
*)&ver
);
2774 s_isAtLeastWin2kSP4
=
2775 ((ver
.dwMajorVersion
> 5) || // Vista+
2776 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2777 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2778 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2782 return s_isAtLeastWin2kSP4
== 1;
2787 // the code page we're working with
2790 // cached result of GetMBNulLen(), set to 0 initially meaning
2792 size_t m_minMBCharWidth
;
2795 #endif // wxHAVE_WIN32_MB2WC
2798 // ============================================================================
2799 // wxEncodingConverter based conversion classes
2800 // ============================================================================
// wxMBConv implementation built on two wxEncodingConverter objects: m2w
// converts from the encoding m_enc to wxFONTENCODING_UNICODE and w2m converts
// in the opposite direction. m_ok records whether both could be initialized.
2804 class wxMBConv_wxwin
: public wxMBConv
2809 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2810 // The wxMBConv_cf class does a better job.
2811 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2812 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2813 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2817 // temporarily just use wxEncodingConverter stuff,
2818 // so that it works while a better implementation is built
// Constructs the converter from a charset name; the name is mapped to an
// encoding non-interactively (second argument "false").
2819 wxMBConv_wxwin(const char* name
)
2822 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2824 m_enc
= wxFONTENCODING_SYSTEM
;
// Constructs the converter directly from an encoding value.
2829 wxMBConv_wxwin(wxFontEncoding enc
)
// Converts the NUL-terminated multibyte string psz into buf using m2w.
// NOTE(review): the size argument n is unused (WXUNUSED), so nothing visible
// here limits the write to the caller's buffer size -- confirm callers always
// pass a large enough buffer.
2836 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2838 size_t inbuf
= strlen(psz
);
2841 if (!m2w
.Convert(psz
, buf
))
2842 return wxCONV_FAILED
;
// Converts the NUL-terminated wide string psz into buf using w2m; as in
// MB2WC() the size argument n is unused.
2847 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2849 const size_t inbuf
= wxWcslen(psz
);
2852 if (!w2m
.Convert(psz
, buf
))
2853 return wxCONV_FAILED
;
// Length of the terminating NUL; the UTF-16 and UTF-32 encodings are handled
// as separate groups (the returned values are not part of this listing).
2859 virtual size_t GetMBNulLen() const
2863 case wxFONTENCODING_UTF16BE
:
2864 case wxFONTENCODING_UTF16LE
:
2867 case wxFONTENCODING_UTF32BE
:
2868 case wxFONTENCODING_UTF32LE
:
// Clone() creates a fresh converter for the same encoding rather than copying
// this object (copying is disabled by wxDECLARE_NO_COPY_CLASS below).
2876 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2878 bool IsOk() const { return m_ok
; }
// the encoding this converter was created for
2881 wxFontEncoding m_enc
;
// m2w: m_enc -> Unicode, w2m: Unicode -> m_enc
2882 wxEncodingConverter m2w
, w2m
;
2885 // were we initialized successfully?
2888 wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin
);
2891 // make the constructors available for unit testing
// Exported factory: allocates a wxMBConv_wxwin for the given charset name and
// then checks it with IsOk().
// NOTE(review): what happens when IsOk() fails is not part of this listing.
2892 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2894 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2895 if ( !result
->IsOk() )
2904 #endif // wxUSE_FONTMAP
2906 // ============================================================================
2907 // wxCSConv implementation
2908 // ============================================================================
2910 void wxCSConv::Init()
// Constructs the converter from a charset name.
// For a non-empty name, the name is stored with SetName() and m_encoding is
// looked up with wxFontMapperBase::GetEncodingFromName(); a result of
// wxFONTENCODING_MAX is replaced by wxFONTENCODING_SYSTEM and
// wxFONTENCODING_DEFAULT by wxFONTENCODING_ISO8859_1.
// The last assignment to wxFONTENCODING_SYSTEM belongs to another branch whose
// condition is not part of this listing.
2917 wxCSConv::wxCSConv(const wxString
& charset
)
2921 if ( !charset
.empty() )
2923 SetName(charset
.ToAscii());
2927 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2928 if ( m_encoding
== wxFONTENCODING_MAX
)
2930 // set to unknown/invalid value
2931 m_encoding
= wxFONTENCODING_SYSTEM
;
2933 else if ( m_encoding
== wxFONTENCODING_DEFAULT
)
2935 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2936 m_encoding
= wxFONTENCODING_ISO8859_1
;
2939 m_encoding
= wxFONTENCODING_SYSTEM
;
// Constructs the converter from an encoding value.
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not accepted here: they
// trigger wxFAIL_MSG and are replaced by wxFONTENCODING_SYSTEM before the
// value is stored in m_encoding.
2943 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2945 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2947 wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2949 encoding
= wxFONTENCODING_SYSTEM
;
2954 m_encoding
= encoding
;
2957 wxCSConv::~wxCSConv()
// Copy constructor: takes over the charset name (through SetName()) and the
// encoding of conv.
2962 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2967 SetName(conv
.m_name
);
2968 m_encoding
= conv
.m_encoding
;
// Assignment: takes over the charset name (through SetName()) and the
// encoding of conv.
2971 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2975 SetName(conv
.m_name
);
2976 m_encoding
= conv
.m_encoding
;
2981 void wxCSConv::Clear()
// Stores a private duplicate of charset, made with wxStrdup(), in m_name.
// NOTE(review): any guard around the assignment is not part of this listing.
2990 void wxCSConv::SetName(const char *charset
)
2994 m_name
= wxStrdup(charset
);
// Hash map from wxFontEncoding to wxString, used by wxCSConv::DoCreate() as
// a per-encoding cache of charset names for wxMBConv_iconv; an empty string
// records a failed attempt.
3001 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
3002 wxEncodingNameCache
);
3004 static wxEncodingNameCache gs_nameCache
;
// Creates the converter object that does the real work for this wxCSConv,
// chosen from m_name and/or m_encoding.
// The candidates appear in this order: nothing at all for ISO8859-1/DEFAULT
// (handled by wxCSConv itself), wxMBConv_iconv, wxMBConv_win32, wxMBConv_cf,
// the built-in UTF-7/8/16/32 classes and finally wxMBConv_wxwin. A trace
// message is logged when the encoding is not supported.
// NOTE(review): the checks deciding whether each candidate is accepted, and
// the value returned on failure, are not part of this listing.
3007 wxMBConv
*wxCSConv::DoCreate() const
3010 wxLogTrace(TRACE_STRCONV
,
3011 wxT("creating conversion for %s"),
3013 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
3014 #endif // wxUSE_FONTMAP
3016 // check for the special case of ASCII or ISO8859-1 charset: as we have
3017 // special knowledge of it anyhow, we don't need to create a special
3018 // conversion object
3019 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
3020 m_encoding
== wxFONTENCODING_DEFAULT
)
3022 // don't convert at all
3026 // we trust OS to do conversion better than we can so try external
3027 // conversion methods first
3029 // the full order is:
3030 // 1. OS conversion (iconv() under Unix or Win32 API)
3031 // 2. hard coded conversions for UTF
3032 // 3. wxEncodingConverter as fall back
3038 #endif // !wxUSE_FONTMAP
3041 wxFontEncoding
encoding(m_encoding
);
3046 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
3054 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3055 #endif // wxUSE_FONTMAP
// look for a charset name already recorded for this encoding
3059 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3060 if ( it
!= gs_nameCache
.end() )
3062 if ( it
->second
.empty() )
3065 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
3072 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3073 // CS : in case this does not return valid names (eg for MacRoman)
3074 // encoding got a 'failure' entry in the cache all the same,
3075 // although it just has to be created using a different method, so
3076 // only store failed iconv creation attempts (or perhaps we
3077 // shouldn't do this at all ?)
3078 if ( names
[0] != NULL
)
3080 for ( ; *names
; ++names
)
3082 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3083 // will need changes that will obsolete this
3084 wxString
name(*names
);
3085 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
3088 gs_nameCache
[encoding
] = *names
;
3095 gs_nameCache
[encoding
] = wxT(""); // cache the failure
3098 #endif // wxUSE_FONTMAP
3100 #endif // HAVE_ICONV
3102 #ifdef wxHAVE_WIN32_MB2WC
3105 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3106 : new wxMBConv_win32(m_encoding
);
3115 #endif // wxHAVE_WIN32_MB2WC
3119 // leave UTF16 and UTF32 to the built-ins of wx
3120 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3121 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3124 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3125 : new wxMBConv_cf(m_encoding
);
3127 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3136 #endif // __DARWIN__
3139 wxFontEncoding enc
= m_encoding
;
3141 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3143 // use "false" to suppress interactive dialogs -- we can be called from
3144 // anywhere and popping up a dialog from here is the last thing we want to
3146 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3148 #endif // wxUSE_FONTMAP
// encodings for which wx has its own converter classes
3152 case wxFONTENCODING_UTF7
:
3153 return new wxMBConvUTF7
;
3155 case wxFONTENCODING_UTF8
:
3156 return new wxMBConvUTF8
;
3158 case wxFONTENCODING_UTF16BE
:
3159 return new wxMBConvUTF16BE
;
3161 case wxFONTENCODING_UTF16LE
:
3162 return new wxMBConvUTF16LE
;
3164 case wxFONTENCODING_UTF32BE
:
3165 return new wxMBConvUTF32BE
;
3167 case wxFONTENCODING_UTF32LE
:
3168 return new wxMBConvUTF32LE
;
3171 // nothing to do but put here to suppress gcc warnings
// fall back to the wxEncodingConverter based class
3178 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3179 : new wxMBConv_wxwin(m_encoding
);
3186 wxLogTrace(TRACE_STRCONV
,
3187 wxT("encoding \"%s\" is not supported by this system"),
3188 (m_name
? wxString(m_name
)
3189 : wxFontMapperBase::GetEncodingName(m_encoding
)));
3190 #endif // wxUSE_FONTMAP
// Performs the deferred creation of the real converter: resolves
// wxFONTENCODING_SYSTEM to a concrete encoding when no name is set, stores
// the result of DoCreate() in m_convReal and clears m_deferred.
// The method is const, so all members are written through a non-const alias
// of this.
// NOTE(review): the guard that makes this run only once (presumably a test of
// m_deferred) is not part of this listing.
3195 void wxCSConv::CreateConvIfNeeded() const
3199 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3201 // if we have neither the name nor the encoding, use the default
3202 // encoding for this system
3203 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3206 self
->m_encoding
= wxLocale::GetSystemEncoding();
3208 // fallback to some reasonable default:
3209 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3210 #endif // wxUSE_INTL
3213 self
->m_convReal
= DoCreate();
3214 self
->m_deferred
= false;
// Reports whether this converter can be used. The real converter is created
// first if that has not happened yet; ISO8859-1 is always usable because it
// needs no real converter, otherwise the answer is whether m_convReal exists.
3218 bool wxCSConv::IsOk() const
3220 CreateConvIfNeeded();
3222 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3223 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3224 return true; // always ok as we do it ourselves
3226 // m_convReal->IsOk() is called at its own creation, so we know it must
3227 // be ok if m_convReal is non-NULL
3228 return m_convReal
!= NULL
;
// Converts srcLen bytes of src to wide characters in dst (capacity dstLen).
// The work is forwarded to m_convReal; the remaining code is the built-in
// path that widens every byte, taken as unsigned char, to one wchar_t.
// srcLen == wxNO_LEN means src is NUL-terminated and the NUL is converted too.
// Returns wxCONV_FAILED if dstLen is smaller than the number of input bytes.
// NOTE(review): the conditions around the forwarding call and the copy loop,
// and the final return, are not part of this listing.
3231 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3232 const char *src
, size_t srcLen
) const
3234 CreateConvIfNeeded();
3237 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3240 if ( srcLen
== wxNO_LEN
)
3241 srcLen
= strlen(src
) + 1; // take trailing NUL too
3245 if ( dstLen
< srcLen
)
3246 return wxCONV_FAILED
;
// the cast to unsigned char keeps bytes >= 0x80 from being sign-extended
3248 for ( size_t n
= 0; n
< srcLen
; n
++ )
3249 dst
[n
] = (unsigned char)(src
[n
]);
// Converts srcLen wide characters of src to bytes in dst (capacity dstLen).
// The work is forwarded to m_convReal; the remaining code is the built-in
// path that narrows each wide character to one byte and fails with
// wxCONV_FAILED on any character above 0xFF or when dstLen is too small.
// srcLen == wxNO_LEN means src is NUL-terminated and the NUL is converted too.
// NOTE(review): the conditions around the forwarding call and the first loop,
// and the final return, are not part of this listing.
3255 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3256 const wchar_t *src
, size_t srcLen
) const
3258 CreateConvIfNeeded();
3261 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3264 if ( srcLen
== wxNO_LEN
)
3265 srcLen
= wxWcslen(src
) + 1;
3269 if ( dstLen
< srcLen
)
3270 return wxCONV_FAILED
;
3272 for ( size_t n
= 0; n
< srcLen
; n
++ )
// NOTE(review): if wchar_t is a signed type, negative values pass this test
// -- confirm whether that can occur
3274 if ( src
[n
] > 0xFF )
3275 return wxCONV_FAILED
;
3277 dst
[n
] = (char)src
[n
];
3281 else // still need to check the input validity
3283 for ( size_t n
= 0; n
< srcLen
; n
++ )
3285 if ( src
[n
] > 0xFF )
3286 return wxCONV_FAILED
;
// Forwards to m_convReal->GetMBNulLen(), creating the real converter first if
// needed; the other path is the built-in ISO-8859-1 case.
// NOTE(review): the test of m_convReal and the ISO-8859-1 return value are
// not part of this listing.
3293 size_t wxCSConv::GetMBNulLen() const
3295 CreateConvIfNeeded();
3299 return m_convReal
->GetMBNulLen();
3302 // otherwise, we are ISO-8859-1
3306 #if wxUSE_UNICODE_UTF8
// Forwards to m_convReal->IsUTF8(), creating the real converter first if
// needed; the other path is the built-in ISO-8859-1 case.
// NOTE(review): the test of m_convReal and the ISO-8859-1 return value are
// not part of this listing.
3307 bool wxCSConv::IsUTF8() const
3309 CreateConvIfNeeded();
3313 return m_convReal
->IsUTF8();
3316 // otherwise, we are ISO-8859-1
// Converts the multibyte string s to a wide character buffer, trying
// wxConvLibc first, then a default-constructed wxMBConvUTF8 and finally
// wxConvISO8859_1. An empty wxWCharBuffer is returned on an early-exit path.
// NOTE(review): the conditions guarding the early return and each fallback
// are not part of this listing.
3324 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3327 return wxWCharBuffer();
3329 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3331 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3333 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Converts the wide string ws to a multibyte buffer, trying wxConvLibc first
// and then a wxMBConvUTF8 constructed with MAP_INVALID_UTF8_TO_OCTAL. An empty
// wxCharBuffer is returned on an early-exit path.
// NOTE(review): the conditions guarding the early return and the fallback are
// not part of this listing.
3338 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3341 return wxCharBuffer();
3343 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3345 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3350 #endif // wxUSE_UNICODE
3352 // ----------------------------------------------------------------------------
3354 // ----------------------------------------------------------------------------
3356 // NB: The reason why we create converter objects in this convoluted way,
3357 // using a factory function instead of global variable, is that they
3358 // may be used at static initialization time (some of them are used by
3359 // wxString ctors and there may be a global wxString object). In other
3360 // words, possibly _before_ the converter global object would be
3367 #undef wxConvISO8859_1
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines, for a
// global converter called "name":
//  - an exported pointer name##Ptr of type klass*, initially NULL;
//  - an exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static impl_klass object constructed with ctor_args;
//  - a file-local gs_##name##instance initialized by calling that accessor.
// WX_DEFINE_GLOBAL_CONV is the shorthand for impl_klass == klass.
3369 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3370 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3371 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3373 static impl_klass name##Obj ctor_args; \
3374 return &name##Obj; \
3376 /* this ensures that all global converter objects are created */ \
3377 /* by the time static initialization is done, i.e. before any */ \
3378 /* thread is launched: */ \
3379 static klass* gs_##name##instance = wxGet_##name##Ptr()
3381 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3382 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3385 // disable warning "variable 'xxx' was declared but never referenced"
3386 #pragma warning(disable: 177)
3390 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3391 #elif 0 // defined(__WXOSX__)
3392 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_cf
, wxConvLibc
, (wxFONTENCODING_UTF8
));
3394 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3397 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3398 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3399 // provokes an error message about "not enough macro parameters"; and we
3400 // can't use "()" here as the name##Obj declaration would be parsed as a
3401 // function declaration then, so use a semicolon and live with an extra
3402 // empty statement (and hope that no compilers warns about this)
3403 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3404 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
3406 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3407 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
3409 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3410 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3413 // The xnu kernel always communicates file paths in decomposed UTF-8.
3414 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3415 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
3418 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3421 #else // !__DARWIN__
3422 wxGet_wxConvLibcPtr();
3423 #endif // __DARWIN__/!__DARWIN__
3425 #else // !wxUSE_WCHAR_T
3427 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3428 // stand-ins in absence of wchar_t
3429 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3434 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T