1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV _T("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
74 static bool NotAllNULs(const char *p
, size_t n
)
76 while ( n
&& *p
++ == '\0' )
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten
= 0;
176 // the number of NULs terminating this string
177 size_t nulLen
= 0; // not really needed, but just to avoid warnings
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
185 if ( srcLen
!= wxNO_LEN
)
187 // we need to know how to find the end of this string
188 nulLen
= GetMBNulLen();
189 if ( nulLen
== wxCONV_FAILED
)
190 return wxCONV_FAILED
;
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
197 char * const p
= bufTmp
.data();
198 memcpy(p
, src
, srcLen
);
199 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
205 srcEnd
= src
+ srcLen
;
207 else // quit after the first loop iteration
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
216 // all the complications come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd == NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
229 // try to convert the current chunk
230 size_t lenChunk
= MB2WC(NULL
, src
, 0);
231 if ( lenChunk
== wxCONV_FAILED
)
232 return wxCONV_FAILED
;
234 dstWritten
+= lenChunk
;
240 // nothing left in the input string, conversion succeeded
246 if ( dstWritten
> dstLen
)
247 return wxCONV_FAILED
;
249 // +1 is for trailing NUL
250 if ( MB2WC(dst
, src
, lenChunk
+ 1) == wxCONV_FAILED
)
251 return wxCONV_FAILED
;
260 // we convert just one chunk in this case as this is the entire
265 // advance the input pointer past the end of this chunk
266 while ( NotAllNULs(src
, nulLen
) )
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if we advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
275 src
+= nulLen
; // skipping over its terminator as well
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
279 // delimited by srcEnd
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
298 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
299 const wchar_t *src
, size_t srcLen
) const
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten
= 0;
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
309 // make a copy of the input string unless it is already properly
311 wxWCharBuffer bufTmp
;
312 if ( isNulTerminated
)
314 srcLen
= wxWcslen(src
) + 1;
316 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
318 // make a copy in order to properly NUL-terminate the string
319 bufTmp
= wxWCharBuffer(srcLen
);
320 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
324 const size_t lenNul
= GetMBNulLen();
325 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
327 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
329 // try to convert the current chunk
330 size_t lenChunk
= WC2MB(NULL
, src
, 0);
332 if ( lenChunk
== wxCONV_FAILED
)
333 return wxCONV_FAILED
;
335 dstWritten
+= lenChunk
;
336 if ( isNulTerminated
)
337 dstWritten
+= lenNul
;
341 if ( dstWritten
> dstLen
)
342 return wxCONV_FAILED
;
344 if ( WC2MB(dst
, src
, lenChunk
+ lenNul
) == wxCONV_FAILED
)
345 return wxCONV_FAILED
;
348 if ( isNulTerminated
)
356 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
358 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
359 if ( rc
!= wxCONV_FAILED
)
361 // ToWChar() returns the buffer length, i.e. including the trailing
362 // NUL, while this method doesn't take it into account
369 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
371 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
372 if ( rc
!= wxCONV_FAILED
)
380 wxMBConv::~wxMBConv()
382 // nothing to do here (necessary for Darwin linking probably)
385 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
389 // calculate the length of the buffer needed first
390 const size_t nLen
= ToWChar(NULL
, 0, psz
);
391 if ( nLen
!= wxCONV_FAILED
)
393 // now do the actual conversion
394 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
396 // +1 for the trailing NUL
397 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
402 return wxWCharBuffer();
405 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
409 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
410 if ( nLen
!= wxCONV_FAILED
)
412 wxCharBuffer
buf(nLen
- 1);
413 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
418 return wxCharBuffer();
422 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
424 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
425 if ( dstLen
!= wxCONV_FAILED
)
427 // notice that we allocate space for dstLen+1 wide characters here
428 // because we want the buffer to always be NUL-terminated, even if the
429 // input isn't (as otherwise the caller has no way to know its length)
430 wxWCharBuffer
wbuf(dstLen
);
431 wbuf
.data()[dstLen
] = L
'\0';
432 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
438 // we also need to handle NUL-terminated input strings
439 // specially: for them the output is the length of the string
440 // excluding the trailing NUL, however if we're asked to
441 // convert a specific number of characters we return the length
442 // of the resulting output even if it's NUL-terminated
443 if ( inLen
== wxNO_LEN
)
454 return wxWCharBuffer();
458 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
460 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
461 if ( dstLen
!= wxCONV_FAILED
)
463 const size_t nulLen
= GetMBNulLen();
465 // as above, ensure that the buffer is always NUL-terminated, even if
467 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
468 memset(buf
.data() + dstLen
, 0, nulLen
);
469 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
475 if ( inLen
== wxNO_LEN
)
477 // in this case both input and output are NUL-terminated
478 // and we're not supposed to count NUL
490 return wxCharBuffer();
493 // ----------------------------------------------------------------------------
495 // ----------------------------------------------------------------------------
497 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
499 return wxMB2WC(buf
, psz
, n
);
502 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
504 return wxWC2MB(buf
, psz
, n
);
507 // ----------------------------------------------------------------------------
508 // wxConvBrokenFileNames
509 // ----------------------------------------------------------------------------
513 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
515 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
516 wxStricmp(charset
, _T("UTF8")) == 0 )
517 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
519 m_conv
= new wxCSConv(charset
);
524 // ----------------------------------------------------------------------------
526 // ----------------------------------------------------------------------------
528 // Implementation (C) 2004 Fredrik Roubert
530 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
533 // BASE64 decoding table
535 static const unsigned char utf7unb64
[] =
537 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
538 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
539 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
540 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
541 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
542 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
543 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
544 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
545 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
546 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
547 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
548 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
549 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
550 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
551 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
552 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
553 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
554 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
555 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
556 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
557 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
558 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
559 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
560 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
561 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
562 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
563 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
564 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
565 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
566 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
567 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
568 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
571 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
572 const char *src
, size_t srcLen
) const
574 DecoderState stateOrig
,
576 if ( srcLen
== wxNO_LEN
)
578 // convert the entire string, up to and including the trailing NUL
579 srcLen
= strlen(src
) + 1;
581 // when working on the entire string we neither update nor use the shift
582 // state from the previous call
583 statePtr
= &stateOrig
;
585 else // when working with partial strings we do use the shift state
587 statePtr
= wx_const_cast(DecoderState
*, &m_stateDecoder
);
589 // also save the old state to be able to rollback to it on error
590 stateOrig
= m_stateDecoder
;
593 // but to simplify the code below we use this variable in both cases
594 DecoderState
& state
= *statePtr
;
597 // number of characters [which would have been] written to dst [if it were
601 const char * const srcEnd
= src
+ srcLen
;
603 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
605 const unsigned char cc
= *src
++;
607 if ( state
.IsShifted() )
609 const unsigned char dc
= utf7unb64
[cc
];
612 // end of encoded part, check that nothing was left: there can
613 // be up to 4 bits of 0 padding but nothing else (we also need
614 // to check isLSB as we count bits modulo 8 while a valid UTF-7
615 // encoded sequence must contain an integral number of UTF-16
617 if ( state
.isLSB
|| state
.bit
> 4 ||
618 (state
.accum
& ((1 << state
.bit
) - 1)) )
623 return wxCONV_FAILED
;
628 // re-parse this character normally below unless it's '-' which
629 // is consumed by the decoder
633 else // valid encoded character
635 // mini base64 decoder: each character is 6 bits
640 if ( state
.bit
>= 8 )
642 // got the full byte, consume it
644 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
648 // we've got the full word, output it
650 *dst
++ = (state
.msb
<< 8) | b
;
656 // just store it while we wait for LSB
664 if ( state
.IsDirect() )
666 // start of an encoded segment?
671 // just the encoded plus sign, don't switch to shifted mode
677 else if ( utf7unb64
[(unsigned)*src
] == 0xff )
679 // empty encoded chunks are not allowed
683 return wxCONV_FAILED
;
685 else // base-64 encoded chunk follows
692 // only printable 7 bit ASCII characters (with the exception of
693 // NUL, TAB, CR and LF) can be used directly
694 if ( cc
>= 0x7f || (cc
< ' ' &&
695 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
696 return wxCONV_FAILED
;
707 // as we didn't read any characters we should be called with the same
708 // data (followed by some more new data) again later so don't save our
712 return wxCONV_FAILED
;
719 // BASE64 encoding table
721 static const unsigned char utf7enb64
[] =
723 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
724 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
725 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
726 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
727 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
728 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
729 'w', 'x', 'y', 'z', '0', '1', '2', '3',
730 '4', '5', '6', '7', '8', '9', '+', '/'
734 // UTF-7 encoding table
736 // 0 - Set D (directly encoded characters)
737 // 1 - Set O (optional direct characters)
738 // 2 - whitespace characters (optional)
739 // 3 - special characters
741 static const unsigned char utf7encode
[128] =
743 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
744 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
745 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
746 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
747 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
748 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
749 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
750 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
753 static inline bool wxIsUTF7Direct(wchar_t wc
)
755 return wc
< 0x80 && utf7encode
[wc
] < 1;
758 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
759 const wchar_t *src
, size_t srcLen
) const
761 EncoderState stateOrig
,
763 if ( srcLen
== wxNO_LEN
)
765 // we don't apply the stored state when operating on entire strings at
767 statePtr
= &stateOrig
;
769 srcLen
= wxWcslen(src
) + 1;
771 else // do use the mode we left the output in previously
773 stateOrig
= m_stateEncoder
;
774 statePtr
= wx_const_cast(EncoderState
*, &m_stateEncoder
);
777 EncoderState
& state
= *statePtr
;
782 const wchar_t * const srcEnd
= src
+ srcLen
;
783 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
786 if ( wxIsUTF7Direct(cc
) )
788 if ( state
.IsShifted() )
790 // pad with zeros the last encoded block if necessary
794 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
809 else if ( cc
== '+' && state
.IsDirect() )
820 else if (((wxUint32
)cc
) > 0xffff)
822 // no surrogate pair generation (yet?)
823 return wxCONV_FAILED
;
828 if ( state
.IsDirect() )
837 // BASE64 encode string
840 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
843 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
845 for (state
.bit
+= 8; state
.bit
>= 6; )
849 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
854 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
862 // we need to restore the original encoder state if we were called just to
863 // calculate the amount of space needed as we will presumably be called
864 // again to really convert the data now
871 // ----------------------------------------------------------------------------
873 // ----------------------------------------------------------------------------
875 static const wxUint32 utf8_max
[]=
876 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
878 // boundaries of the private use area we use to (temporarily) remap invalid
879 // characters in a UTF-8 encoded string
880 const wxUint32 wxUnicodePUA
= 0x100000;
881 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
883 // this table gives the length of the UTF-8 encoding from its first character:
884 const unsigned char tableUtf8Lengths
[256] = {
885 // single-byte sequences (ASCII):
886 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
887 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
888 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
889 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
890 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
891 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
892 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
893 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
895 // these are invalid:
896 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
897 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
898 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
899 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
902 // two-byte sequences:
903 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
904 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
906 // three-byte sequences:
907 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
909 // four-byte sequences:
910 4, 4, 4, 4, 4, // F0..F4
912 // these are invalid again (5- or 6-byte
913 // sequences and sequences for code points
914 // above U+10FFFF, as restricted by RFC 3629):
915 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
919 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
920 const char *src
, size_t srcLen
) const
922 wchar_t *out
= dstLen
? dst
: NULL
;
925 if ( srcLen
== wxNO_LEN
)
926 srcLen
= strlen(src
) + 1;
928 for ( const char *p
= src
; ; p
++ )
930 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
932 // all done successfully, just add the trailing NUL if we are not
933 // using explicit length
934 if ( srcLen
== wxNO_LEN
)
950 if ( out
&& !dstLen
-- )
954 unsigned char c
= *p
;
958 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
961 if ( srcLen
!= wxNO_LEN
)
968 unsigned len
= tableUtf8Lengths
[c
];
972 if ( srcLen
< len
) // the test works for wxNO_LEN too
975 if ( srcLen
!= wxNO_LEN
)
978 // Char. number range | UTF-8 octet sequence
979 // (hexadecimal) | (binary)
980 // ----------------------+----------------------------------------
981 // 0000 0000 - 0000 007F | 0xxxxxxx
982 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
983 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
984 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
986 // Code point value is stored in bits marked with 'x',
987 // lowest-order bit of the value on the right side in the diagram
988 // above. (from RFC 3629)
990 // mask to extract lead byte's value ('x' bits above), by sequence
992 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
994 // mask and value of lead byte's most significant bits, by length:
995 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
996 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
998 len
--; // it's more convenient to work with 0-based length here
1000 // extract the lead byte's value bits:
1001 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
1004 code
= c
& leadValueMask
[len
];
1006 // all remaining bytes, if any, are handled in the same way
1007 // regardless of sequence's length:
1008 for ( ; len
; --len
)
1011 if ( (c
& 0xC0) != 0x80 )
1012 return wxCONV_FAILED
;
1020 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1021 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
1030 #endif // WC_UTF16/!WC_UTF16
1038 return wxCONV_FAILED
;
1042 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
1043 const wchar_t *src
, size_t srcLen
) const
1045 char *out
= dstLen
? dst
: NULL
;
1048 for ( const wchar_t *wp
= src
; ; wp
++ )
1050 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
) )
1052 // all done successfully, just add the trailing NUL if we are not
1053 // using explicit length
1054 if ( srcLen
== wxNO_LEN
)
1070 if ( srcLen
!= wxNO_LEN
)
1075 // cast is ok for WC_UTF16
1076 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1078 // skip the next char too as we decoded a surrogate
1081 #else // wchar_t is UTF-32
1082 code
= *wp
& 0x7fffffff;
1094 out
[0] = (char)code
;
1097 else if ( code
<= 0x07FF )
1105 // NB: this line takes 6 least significant bits, encodes them as
1106 // 10xxxxxx and discards them so that the next byte can be encoded:
1107 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1108 out
[0] = 0xC0 | code
;
1111 else if ( code
< 0xFFFF )
1119 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1120 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1121 out
[0] = 0xE0 | code
;
1124 else if ( code
<= 0x10FFFF )
1132 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1133 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1134 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1135 out
[0] = 0xF0 | code
;
1140 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1153 // we only get here if an error occurs during decoding
1154 return wxCONV_FAILED
;
1157 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1158 const char *psz
, size_t srcLen
) const
1160 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1161 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1165 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1167 const char *opsz
= psz
;
1168 bool invalid
= false;
1169 unsigned char cc
= *psz
++, fc
= cc
;
1171 for (cnt
= 0; fc
& 0x80; cnt
++)
1181 // escape the escape character for octal escapes
1182 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1183 && cc
== '\\' && (!buf
|| len
< n
))
1195 // invalid UTF-8 sequence
1200 unsigned ocnt
= cnt
- 1;
1201 wxUint32 res
= cc
& (0x3f >> cnt
);
1205 if ((cc
& 0xC0) != 0x80)
1207 // invalid UTF-8 sequence
1213 res
= (res
<< 6) | (cc
& 0x3f);
1216 if (invalid
|| res
<= utf8_max
[ocnt
])
1218 // illegal UTF-8 encoding
1221 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1222 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1224 // if one of our PUA characters turns up externally
1225 // it must also be treated as an illegal sequence
1226 // (a bit like you have to escape an escape character)
1232 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1233 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1234 if (pa
== wxCONV_FAILED
)
1246 *buf
++ = (wchar_t)res
;
1248 #endif // WC_UTF16/!WC_UTF16
1254 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1256 while (opsz
< psz
&& (!buf
|| len
< n
))
1259 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1260 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1261 wxASSERT(pa
!= wxCONV_FAILED
);
1268 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1274 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1276 while (opsz
< psz
&& (!buf
|| len
< n
))
1278 if ( buf
&& len
+ 3 < n
)
1280 unsigned char on
= *opsz
;
1282 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1283 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1284 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1291 else // MAP_INVALID_UTF8_NOT
1293 return wxCONV_FAILED
;
1299 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1305 static inline bool isoctal(wchar_t wch
)
1307 return L
'0' <= wch
&& wch
<= L
'7';
1310 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1311 const wchar_t *psz
, size_t srcLen
) const
1313 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1314 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1318 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1323 // cast is ok for WC_UTF16
1324 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1325 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1327 cc
= (*psz
++) & 0x7fffffff;
1330 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1331 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1334 *buf
++ = (char)(cc
- wxUnicodePUA
);
1337 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1338 && cc
== L
'\\' && psz
[0] == L
'\\' )
1345 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1347 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1351 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1352 (psz
[1] - L
'0') * 010 +
1362 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1378 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1380 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1386 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1392 // ============================================================================
1394 // ============================================================================
1396 #ifdef WORDS_BIGENDIAN
1397 #define wxMBConvUTF16straight wxMBConvUTF16BE
1398 #define wxMBConvUTF16swap wxMBConvUTF16LE
1400 #define wxMBConvUTF16swap wxMBConvUTF16BE
1401 #define wxMBConvUTF16straight wxMBConvUTF16LE
1405 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1407 if ( srcLen
== wxNO_LEN
)
1409 // count the number of bytes in input, including the trailing NULs
1410 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1411 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1414 srcLen
*= BYTES_PER_CHAR
;
1416 else // we already have the length
1418 // we can only convert an entire number of UTF-16 characters
1419 if ( srcLen
% BYTES_PER_CHAR
)
1420 return wxCONV_FAILED
;
1426 // case when in-memory representation is UTF-16 too
1429 // ----------------------------------------------------------------------------
1430 // conversions without endianness change
1431 // ----------------------------------------------------------------------------
1434 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1435 const char *src
, size_t srcLen
) const
1437 // set up the scene for using memcpy() (which is presumably more efficient
1438 // than copying the bytes one by one)
1439 srcLen
= GetLength(src
, srcLen
);
1440 if ( srcLen
== wxNO_LEN
)
1441 return wxCONV_FAILED
;
1443 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1446 if ( dstLen
< inLen
)
1447 return wxCONV_FAILED
;
1449 memcpy(dst
, src
, srcLen
);
1456 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1457 const wchar_t *src
, size_t srcLen
) const
1459 if ( srcLen
== wxNO_LEN
)
1460 srcLen
= wxWcslen(src
) + 1;
1462 srcLen
*= BYTES_PER_CHAR
;
1466 if ( dstLen
< srcLen
)
1467 return wxCONV_FAILED
;
1469 memcpy(dst
, src
, srcLen
);
1475 // ----------------------------------------------------------------------------
1476 // endian-reversing conversions
1477 // ----------------------------------------------------------------------------
1480 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1481 const char *src
, size_t srcLen
) const
1483 srcLen
= GetLength(src
, srcLen
);
1484 if ( srcLen
== wxNO_LEN
)
1485 return wxCONV_FAILED
;
1487 srcLen
/= BYTES_PER_CHAR
;
1491 if ( dstLen
< srcLen
)
1492 return wxCONV_FAILED
;
1494 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1495 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1497 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1505 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1506 const wchar_t *src
, size_t srcLen
) const
1508 if ( srcLen
== wxNO_LEN
)
1509 srcLen
= wxWcslen(src
) + 1;
1511 srcLen
*= BYTES_PER_CHAR
;
1515 if ( dstLen
< srcLen
)
1516 return wxCONV_FAILED
;
1518 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1519 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1521 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1528 #else // !WC_UTF16: wchar_t is UTF-32
1530 // ----------------------------------------------------------------------------
1531 // conversions without endianness change
1532 // ----------------------------------------------------------------------------
1535 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1536 const char *src
, size_t srcLen
) const
1538 srcLen
= GetLength(src
, srcLen
);
1539 if ( srcLen
== wxNO_LEN
)
1540 return wxCONV_FAILED
;
1542 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1545 // optimization: return maximal space which could be needed for this
1546 // string even if the real size could be smaller if the buffer contains
1552 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1553 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1555 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1557 return wxCONV_FAILED
;
1559 if ( ++outLen
> dstLen
)
1560 return wxCONV_FAILED
;
1570 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1571 const wchar_t *src
, size_t srcLen
) const
1573 if ( srcLen
== wxNO_LEN
)
1574 srcLen
= wxWcslen(src
) + 1;
1577 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1578 for ( size_t n
= 0; n
< srcLen
; n
++ )
1581 const size_t numChars
= encode_utf16(*src
++, cc
);
1582 if ( numChars
== wxCONV_FAILED
)
1583 return wxCONV_FAILED
;
1585 outLen
+= numChars
* BYTES_PER_CHAR
;
1588 if ( outLen
> dstLen
)
1589 return wxCONV_FAILED
;
1592 if ( numChars
== 2 )
1594 // second character of a surrogate
1603 // ----------------------------------------------------------------------------
1604 // endian-reversing conversions
1605 // ----------------------------------------------------------------------------
1608 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1609 const char *src
, size_t srcLen
) const
1611 srcLen
= GetLength(src
, srcLen
);
1612 if ( srcLen
== wxNO_LEN
)
1613 return wxCONV_FAILED
;
1615 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1618 // optimization: return maximal space which could be needed for this
1619 // string even if the real size could be smaller if the buffer contains
1625 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1626 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1631 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1633 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1635 const size_t numChars
= decode_utf16(tmp
, ch
);
1636 if ( numChars
== wxCONV_FAILED
)
1637 return wxCONV_FAILED
;
1639 if ( numChars
== 2 )
1642 if ( ++outLen
> dstLen
)
1643 return wxCONV_FAILED
;
1653 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1654 const wchar_t *src
, size_t srcLen
) const
1656 if ( srcLen
== wxNO_LEN
)
1657 srcLen
= wxWcslen(src
) + 1;
1660 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1661 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1664 const size_t numChars
= encode_utf16(*src
, cc
);
1665 if ( numChars
== wxCONV_FAILED
)
1666 return wxCONV_FAILED
;
1668 outLen
+= numChars
* BYTES_PER_CHAR
;
1671 if ( outLen
> dstLen
)
1672 return wxCONV_FAILED
;
1674 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1675 if ( numChars
== 2 )
1677 // second character of a surrogate
1678 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1686 #endif // WC_UTF16/!WC_UTF16
1689 // ============================================================================
1691 // ============================================================================
1693 #ifdef WORDS_BIGENDIAN
1694 #define wxMBConvUTF32straight wxMBConvUTF32BE
1695 #define wxMBConvUTF32swap wxMBConvUTF32LE
1697 #define wxMBConvUTF32swap wxMBConvUTF32BE
1698 #define wxMBConvUTF32straight wxMBConvUTF32LE
1702 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1703 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
1706 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1708 if ( srcLen
== wxNO_LEN
)
1710 // count the number of bytes in input, including the trailing NULs
1711 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1712 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1715 srcLen
*= BYTES_PER_CHAR
;
1717 else // we already have the length
1719 // we can only convert an entire number of UTF-32 characters
1720 if ( srcLen
% BYTES_PER_CHAR
)
1721 return wxCONV_FAILED
;
1727 // case when in-memory representation is UTF-16
1730 // ----------------------------------------------------------------------------
1731 // conversions without endianness change
1732 // ----------------------------------------------------------------------------
1735 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1736 const char *src
, size_t srcLen
) const
1738 srcLen
= GetLength(src
, srcLen
);
1739 if ( srcLen
== wxNO_LEN
)
1740 return wxCONV_FAILED
;
1742 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1743 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1745 for ( size_t n
= 0; n
< inLen
; n
++ )
1748 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1749 if ( numChars
== wxCONV_FAILED
)
1750 return wxCONV_FAILED
;
1755 if ( outLen
> dstLen
)
1756 return wxCONV_FAILED
;
1759 if ( numChars
== 2 )
1761 // second character of a surrogate
1771 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1772 const wchar_t *src
, size_t srcLen
) const
1774 if ( srcLen
== wxNO_LEN
)
1775 srcLen
= wxWcslen(src
) + 1;
1779 // optimization: return maximal space which could be needed for this
1780 // string instead of the exact amount which could be less if there are
1781 // any surrogates in the input
1783 // we consider that surrogates are rare enough to make it worthwhile to
1784 // avoid running the loop below at the cost of slightly extra memory
1786 return srcLen
* BYTES_PER_CHAR
;
1789 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1791 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1793 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1795 return wxCONV_FAILED
;
1797 outLen
+= BYTES_PER_CHAR
;
1799 if ( outLen
> dstLen
)
1800 return wxCONV_FAILED
;
1808 // ----------------------------------------------------------------------------
1809 // endian-reversing conversions
1810 // ----------------------------------------------------------------------------
1813 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1814 const char *src
, size_t srcLen
) const
1816 srcLen
= GetLength(src
, srcLen
);
1817 if ( srcLen
== wxNO_LEN
)
1818 return wxCONV_FAILED
;
1820 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1821 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1823 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1826 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1827 if ( numChars
== wxCONV_FAILED
)
1828 return wxCONV_FAILED
;
1833 if ( outLen
> dstLen
)
1834 return wxCONV_FAILED
;
1837 if ( numChars
== 2 )
1839 // second character of a surrogate
1849 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1850 const wchar_t *src
, size_t srcLen
) const
1852 if ( srcLen
== wxNO_LEN
)
1853 srcLen
= wxWcslen(src
) + 1;
1857 // optimization: return maximal space which could be needed for this
1858 // string instead of the exact amount which could be less if there are
1859 // any surrogates in the input
1861 // we consider that surrogates are rare enough to make it worthwhile to
1862 // avoid running the loop below at the cost of slightly extra memory
1864 return srcLen
*BYTES_PER_CHAR
;
1867 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1869 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1871 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1873 return wxCONV_FAILED
;
1875 outLen
+= BYTES_PER_CHAR
;
1877 if ( outLen
> dstLen
)
1878 return wxCONV_FAILED
;
1880 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1886 #else // !WC_UTF16: wchar_t is UTF-32
1888 // ----------------------------------------------------------------------------
1889 // conversions without endianness change
1890 // ----------------------------------------------------------------------------
1893 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1894 const char *src
, size_t srcLen
) const
1896 // use memcpy() as it should be much faster than hand-written loop
1897 srcLen
= GetLength(src
, srcLen
);
1898 if ( srcLen
== wxNO_LEN
)
1899 return wxCONV_FAILED
;
1901 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1904 if ( dstLen
< inLen
)
1905 return wxCONV_FAILED
;
1907 memcpy(dst
, src
, srcLen
);
1914 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1915 const wchar_t *src
, size_t srcLen
) const
1917 if ( srcLen
== wxNO_LEN
)
1918 srcLen
= wxWcslen(src
) + 1;
1920 srcLen
*= BYTES_PER_CHAR
;
1924 if ( dstLen
< srcLen
)
1925 return wxCONV_FAILED
;
1927 memcpy(dst
, src
, srcLen
);
1933 // ----------------------------------------------------------------------------
1934 // endian-reversing conversions
1935 // ----------------------------------------------------------------------------
1938 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1939 const char *src
, size_t srcLen
) const
1941 srcLen
= GetLength(src
, srcLen
);
1942 if ( srcLen
== wxNO_LEN
)
1943 return wxCONV_FAILED
;
1945 srcLen
/= BYTES_PER_CHAR
;
1949 if ( dstLen
< srcLen
)
1950 return wxCONV_FAILED
;
1952 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1953 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1955 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1963 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1964 const wchar_t *src
, size_t srcLen
) const
1966 if ( srcLen
== wxNO_LEN
)
1967 srcLen
= wxWcslen(src
) + 1;
1969 srcLen
*= BYTES_PER_CHAR
;
1973 if ( dstLen
< srcLen
)
1974 return wxCONV_FAILED
;
1976 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1977 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1979 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1986 #endif // WC_UTF16/!WC_UTF16
1989 // ============================================================================
1990 // The classes doing conversion using the iconv_xxx() functions
1991 // ============================================================================
1995 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1996 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1997 // (unless there's yet another bug in glibc) the only case when iconv()
1998 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1999 // left in the input buffer -- when _real_ error occurs,
2000 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2002 // [This bug does not appear in glibc 2.2.]
2003 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2004 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2005 (errno != E2BIG || bufLeft != 0))
2007 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2010 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2012 #define ICONV_T_INVALID ((iconv_t)-1)
2014 #if SIZEOF_WCHAR_T == 4
2015 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2016 #define WC_ENC wxFONTENCODING_UTF32
2017 #elif SIZEOF_WCHAR_T == 2
2018 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2019 #define WC_ENC wxFONTENCODING_UTF16
2020 #else // sizeof(wchar_t) != 2 nor 4
2021 // does this ever happen?
2022 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2025 // ----------------------------------------------------------------------------
2026 // wxMBConv_iconv: encapsulates an iconv character set
2027 // ----------------------------------------------------------------------------
2029 class wxMBConv_iconv
: public wxMBConv
2032 wxMBConv_iconv(const char *name
);
2033 virtual ~wxMBConv_iconv();
2035 // implement base class virtual methods
2036 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
2037 const char *src
, size_t srcLen
= wxNO_LEN
) const;
2038 virtual size_t FromWChar(char *dst
, size_t dstLen
,
2039 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
2040 virtual size_t GetMBNulLen() const;
2042 #if wxUSE_UNICODE_UTF8
2043 virtual bool IsUTF8() const;
2046 virtual wxMBConv
*Clone() const
2048 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
2049 p
->m_minMBCharWidth
= m_minMBCharWidth
;
2054 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
2057 // the iconv handlers used to translate from multibyte
2058 // to wide char and in the other direction
2063 // guards access to m2w and w2m objects
2064 wxMutex m_iconvMutex
;
2068 // the name (for iconv_open()) of a wide char charset -- if none is
2069 // available on this machine, it will remain empty
2070 static wxString ms_wcCharsetName
;
2072 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2073 // different endian-ness than the native one
2074 static bool ms_wcNeedsSwap
;
2077 // name of the encoding handled by this conversion
2080 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2082 size_t m_minMBCharWidth
;
2085 // make the constructor available for unit testing
2086 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2088 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2089 if ( !result
->IsOk() )
2098 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2099 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
2101 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
2104 m_minMBCharWidth
= 0;
2106 // check for charset that represents wchar_t:
2107 if ( ms_wcCharsetName
.empty() )
2109 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
2112 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2113 #else // !wxUSE_FONTMAP
2114 static const wxChar
*names_static
[] =
2116 #if SIZEOF_WCHAR_T == 4
2118 #elif SIZEOF_WCHAR_T == 2 // must be a comparison: '=' is not valid in a preprocessor expression
2123 const wxChar
**names
= names_static
;
2124 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2126 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2128 const wxString
nameCS(*names
);
2130 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2131 wxString
nameXE(nameCS
);
2133 #ifdef WORDS_BIGENDIAN
2135 #else // little endian
2139 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2142 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2143 if ( m2w
== ICONV_T_INVALID
)
2145 // try charset w/o bytesex info (e.g. "UCS4")
2146 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2148 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2150 // and check for bytesex ourselves:
2151 if ( m2w
!= ICONV_T_INVALID
)
2153 char buf
[2], *bufPtr
;
2162 outsz
= SIZEOF_WCHAR_T
* 2;
2163 char* wbufPtr
= (char*)wbuf
;
2167 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2170 if (ICONV_FAILED(res
, insz
))
2172 wxLogLastError(wxT("iconv"));
2173 wxLogError(_("Conversion to charset '%s' doesn't work."),
2176 else // ok, can convert to this encoding, remember it
2178 ms_wcCharsetName
= nameCS
;
2179 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2183 else // use charset not requiring byte swapping
2185 ms_wcCharsetName
= nameXE
;
2189 wxLogTrace(TRACE_STRCONV
,
2190 wxT("iconv wchar_t charset is \"%s\"%s"),
2191 ms_wcCharsetName
.empty() ? wxString("<none>")
2193 ms_wcNeedsSwap
? _T(" (needs swap)")
2196 else // we already have ms_wcCharsetName
2198 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2201 if ( ms_wcCharsetName
.empty() )
2203 w2m
= ICONV_T_INVALID
;
2207 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2208 if ( w2m
== ICONV_T_INVALID
)
2210 wxLogTrace(TRACE_STRCONV
,
2211 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2212 ms_wcCharsetName
.c_str(), name
);
2217 wxMBConv_iconv::~wxMBConv_iconv()
2219 if ( m2w
!= ICONV_T_INVALID
)
2221 if ( w2m
!= ICONV_T_INVALID
)
2226 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2227 const char *src
, size_t srcLen
) const
2229 if ( srcLen
== wxNO_LEN
)
2231 // find the string length: notice that this must be done differently for
2232 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2234 const size_t nulLen
= GetMBNulLen();
2238 return wxCONV_FAILED
;
2241 srcLen
= strlen(src
); // arguably more optimized than our version
2246 // for UTF-16/32 not only do we need to have 2/4 consecutive NULs
2247 // but they also have to start at character boundary and not
2248 // span two adjacent characters
2250 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2256 // when we're determining the length of the string ourselves we count
2257 // the terminating NUL(s) as part of it and always NUL-terminate the
2262 // we express length in the number of (wide) characters but iconv always
2263 // counts buffer sizes in bytes
2264 dstLen
*= SIZEOF_WCHAR_T
;
2267 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2268 // Unfortunately there are a couple of global wxCSConv objects such as
2269 // wxConvLocal that are used all over wx code, so we have to make sure
2270 // the handle is used by at most one thread at a time. Otherwise
2271 // only a few wx classes would be safe to use from non-main threads
2272 // as MB<->WC conversion would fail "randomly".
2273 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2274 #endif // wxUSE_THREADS
2277 const char *pszPtr
= src
;
2281 char* bufPtr
= (char*)dst
;
2283 // have destination buffer, convert there
2284 size_t dstLenOrig
= dstLen
;
2286 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2289 // convert the number of bytes converted as returned by iconv to the
2290 // number of (wide) characters converted that we need
2291 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2295 // convert to native endianness
2296 for ( unsigned i
= 0; i
< res
; i
++ )
2297 dst
[i
] = WC_BSWAP(dst
[i
]);
2300 else // no destination buffer
2302 // convert using temp buffer to calculate the size of the buffer needed
2308 char* bufPtr
= (char*)tbuf
;
2309 dstLen
= 8 * SIZEOF_WCHAR_T
;
2312 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2315 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
2317 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2320 if (ICONV_FAILED(cres
, srcLen
))
2322 //VS: it is ok if iconv fails, hence trace only
2323 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2324 return wxCONV_FAILED
;
2330 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2331 const wchar_t *src
, size_t srcLen
) const
2334 // NB: the need for this lock is explained in ToWChar()
2335 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2338 if ( srcLen
== wxNO_LEN
)
2339 srcLen
= wxWcslen(src
) + 1;
2341 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2342 size_t outbuflen
= dstLen
;
2345 wchar_t *tmpbuf
= 0;
2349 // need to copy to temp buffer to switch endianness
2350 // (doing WC_BSWAP twice on the original buffer won't help, as it
2351 // could be in read-only memory, or be accessed in some other thread)
2352 tmpbuf
= (wchar_t *)malloc(inbuflen
+ SIZEOF_WCHAR_T
);
2353 for ( size_t i
= 0; i
< srcLen
; i
++ )
2354 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2356 tmpbuf
[srcLen
] = L
'\0';
2360 char* inbuf
= (char*)src
;
2363 // have destination buffer, convert there
2364 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2366 res
= dstLen
- outbuflen
;
2368 else // no destination buffer
2370 // convert using temp buffer to calculate the size of the buffer needed
2378 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2380 res
+= 16 - outbuflen
;
2382 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2390 if (ICONV_FAILED(cres
, inbuflen
))
2392 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2393 return wxCONV_FAILED
;
2399 size_t wxMBConv_iconv::GetMBNulLen() const
2401 if ( m_minMBCharWidth
== 0 )
2403 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2406 // NB: the need for this lock is explained in ToWChar()
2407 wxMutexLocker
lock(self
->m_iconvMutex
);
2410 const wchar_t *wnul
= L
"";
2411 char buf
[8]; // should be enough for NUL in any encoding
2412 size_t inLen
= sizeof(wchar_t),
2413 outLen
= WXSIZEOF(buf
);
2414 char *inBuff
= (char *)wnul
;
2415 char *outBuff
= buf
;
2416 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2418 self
->m_minMBCharWidth
= (size_t)-1;
2422 self
->m_minMBCharWidth
= outBuff
- buf
;
2426 return m_minMBCharWidth
;
2429 #if wxUSE_UNICODE_UTF8
2430 bool wxMBConv_iconv::IsUTF8() const
2432 return wxStricmp(m_name
, "UTF-8") == 0 ||
2433 wxStricmp(m_name
, "UTF8") == 0;
2437 #endif // HAVE_ICONV
2440 // ============================================================================
2441 // Win32 conversion classes
2442 // ============================================================================
2444 #ifdef wxHAVE_WIN32_MB2WC
2448 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2449 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2452 class wxMBConv_win32
: public wxMBConv
2457 m_CodePage
= CP_ACP
;
2458 m_minMBCharWidth
= 0;
2461 wxMBConv_win32(const wxMBConv_win32
& conv
)
2464 m_CodePage
= conv
.m_CodePage
;
2465 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2469 wxMBConv_win32(const char* name
)
2471 m_CodePage
= wxCharsetToCodepage(name
);
2472 m_minMBCharWidth
= 0;
2475 wxMBConv_win32(wxFontEncoding encoding
)
2477 m_CodePage
= wxEncodingToCodepage(encoding
);
2478 m_minMBCharWidth
= 0;
2480 #endif // wxUSE_FONTMAP
2482 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2484 // note that we have to use the MB_ERR_INVALID_CHARS flag as without it
2485 // the behaviour is not compatible with the Unix version (using iconv)
2486 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2487 // wouldn't work if reading an incomplete MB char didn't result in an
2490 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2491 // Win XP or newer and it is not supported for UTF-[78] so we always
2492 // use our own conversions in this case. See
2493 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2494 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2495 if ( m_CodePage
== CP_UTF8
)
2497 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2500 if ( m_CodePage
== CP_UTF7
)
2502 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2506 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2507 IsAtLeastWin2kSP4() )
2509 flags
= MB_ERR_INVALID_CHARS
;
2512 const size_t len
= ::MultiByteToWideChar
2514 m_CodePage
, // code page
2515 flags
, // flags: fall on error
2516 psz
, // input string
2517 -1, // its length (NUL-terminated)
2518 buf
, // output string
2519 buf
? n
: 0 // size of output buffer
2523 // function totally failed
2524 return wxCONV_FAILED
;
2527 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2528 // check if we succeeded, by doing a double trip:
2529 if ( !flags
&& buf
)
2531 const size_t mbLen
= strlen(psz
);
2532 wxCharBuffer
mbBuf(mbLen
);
2533 if ( ::WideCharToMultiByte
2540 mbLen
+ 1, // size in bytes, not length
2544 strcmp(mbBuf
, psz
) != 0 )
2546 // we didn't obtain the same thing we started from, hence
2547 // the conversion was lossy and we consider that it failed
2548 return wxCONV_FAILED
;
2552 // note that it returns count of written chars for buf != NULL and size
2553 // of the needed buffer for buf == NULL so in either case the length of
2554 // the string (which never includes the terminating NUL) is one less
2558 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2561 we have a problem here: by default, WideCharToMultiByte() may
2562 replace characters unrepresentable in the target code page with bad
2563 quality approximations such as turning "1/2" symbol (U+00BD) into
2564 "1" for the code pages which don't have it and we, obviously, want
2565 to avoid this at any price
2567 the trouble is that this function does it _silently_, i.e. it won't
2568 even tell us whether it did or not... Win98/2000 and higher provide
2569 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2570 we have to resort to a round trip, i.e. check that converting back
2571 results in the same string -- this is, of course, expensive but
2572 otherwise we simply can't be sure to not garble the data.
2575 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2576 // it doesn't work with CJK encodings (which we test for rather roughly
2577 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2579 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2582 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2584 // it's our lucky day
2585 flags
= WC_NO_BEST_FIT_CHARS
;
2586 pUsedDef
= &usedDef
;
2588 else // old system or unsupported encoding
2594 const size_t len
= ::WideCharToMultiByte
2596 m_CodePage
, // code page
2597 flags
, // either none or no best fit
2598 pwz
, // input string
2599 -1, // it is (wide) NUL-terminated
2600 buf
, // output buffer
2601 buf
? n
: 0, // and its size
2602 NULL
, // default "replacement" char
2603 pUsedDef
// [out] was it used?
2608 // function totally failed
2609 return wxCONV_FAILED
;
2612 // we did something, check if we really succeeded
2615 // check if the conversion failed, i.e. if any replacements
2618 return wxCONV_FAILED
;
2620 else // we must resort to double tripping...
2622 // first we need to ensure that we really have the MB data: this is
2623 // not the case if we're called with NULL buffer, in which case we
2624 // need to do the conversion yet again
2625 wxCharBuffer bufDef
;
2628 bufDef
= wxCharBuffer(len
);
2629 buf
= bufDef
.data();
2630 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2631 buf
, len
, NULL
, NULL
) )
2632 return wxCONV_FAILED
;
2637 wxWCharBuffer
wcBuf(n
);
2638 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2639 wcscmp(wcBuf
, pwz
) != 0 )
2641 // we didn't obtain the same thing we started from, hence
2642 // the conversion was lossy and we consider that it failed
2643 return wxCONV_FAILED
;
2647 // see the comment above for the reason of "len - 1"
2651 virtual size_t GetMBNulLen() const
2653 if ( m_minMBCharWidth
== 0 )
2655 int len
= ::WideCharToMultiByte
2657 m_CodePage
, // code page
2659 L
"", // input string
2660 1, // translate just the NUL
2661 NULL
, // output buffer
2663 NULL
, // no replacement char
2664 NULL
// [out] don't care if it was used
2667 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2671 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2672 self
->m_minMBCharWidth
= (size_t)-1;
2676 self
->m_minMBCharWidth
= (size_t)-1;
2682 self
->m_minMBCharWidth
= len
;
2687 return m_minMBCharWidth
;
2690 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2692 bool IsOk() const { return m_CodePage
!= -1; }
2695 static bool CanUseNoBestFit()
2697 static int s_isWin98Or2k
= -1;
2699 if ( s_isWin98Or2k
== -1 )
2702 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2704 case wxOS_WINDOWS_9X
:
2705 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2708 case wxOS_WINDOWS_NT
:
2709 s_isWin98Or2k
= verMaj
>= 5;
2713 // unknown: be conservative by default
2718 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2721 return s_isWin98Or2k
== 1;
2724 static bool IsAtLeastWin2kSP4()
2729 static int s_isAtLeastWin2kSP4
= -1;
2731 if ( s_isAtLeastWin2kSP4
== -1 )
2733 OSVERSIONINFOEX ver
;
2735 memset(&ver
, 0, sizeof(ver
));
2736 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2737 GetVersionEx((OSVERSIONINFO
*)&ver
);
2739 s_isAtLeastWin2kSP4
=
2740 ((ver
.dwMajorVersion
> 5) || // Vista+
2741 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2742 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2743 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2747 return s_isAtLeastWin2kSP4
== 1;
2752 // the code page we're working with
2755 // cached result of GetMBNulLen(), set to 0 initially meaning
2757 size_t m_minMBCharWidth
;
2760 #endif // wxHAVE_WIN32_MB2WC
2763 // ============================================================================
2764 // wxEncodingConverter based conversion classes
2765 // ============================================================================
2769 class wxMBConv_wxwin
: public wxMBConv
2774 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2775 // The wxMBConv_cf class does a better job.
2776 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2777 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2778 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2782 // temporarily just use wxEncodingConverter stuff,
2783 // so that it works while a better implementation is built
2784 wxMBConv_wxwin(const char* name
)
2787 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2789 m_enc
= wxFONTENCODING_SYSTEM
;
2794 wxMBConv_wxwin(wxFontEncoding enc
)
2801 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2803 size_t inbuf
= strlen(psz
);
2806 if (!m2w
.Convert(psz
, buf
))
2807 return wxCONV_FAILED
;
2812 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2814 const size_t inbuf
= wxWcslen(psz
);
2817 if (!w2m
.Convert(psz
, buf
))
2818 return wxCONV_FAILED
;
2824 virtual size_t GetMBNulLen() const
2828 case wxFONTENCODING_UTF16BE
:
2829 case wxFONTENCODING_UTF16LE
:
2832 case wxFONTENCODING_UTF32BE
:
2833 case wxFONTENCODING_UTF32LE
:
2841 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2843 bool IsOk() const { return m_ok
; }
2846 wxFontEncoding m_enc
;
2847 wxEncodingConverter m2w
, w2m
;
2850 // were we initialized successfully?
2853 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2856 // make the constructors available for unit testing
2857 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2859 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2860 if ( !result
->IsOk() )
2869 #endif // wxUSE_FONTMAP
2871 // ============================================================================
2872 // wxCSConv implementation
2873 // ============================================================================
2875 void wxCSConv::Init()
2882 wxCSConv::wxCSConv(const wxString
& charset
)
2886 if ( !charset
.empty() )
2888 SetName(charset
.ToAscii());
2892 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2893 if ( m_encoding
== wxFONTENCODING_MAX
)
2895 // set to unknown/invalid value
2896 m_encoding
= wxFONTENCODING_SYSTEM
;
2898 else if ( m_encoding
== wxFONTENCODING_DEFAULT
)
2900 // wxFONTENCODING_DEFAULT is the same as US-ASCII in this context
2901 m_encoding
= wxFONTENCODING_ISO8859_1
;
2904 m_encoding
= wxFONTENCODING_SYSTEM
;
2908 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2910 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2912 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2914 encoding
= wxFONTENCODING_SYSTEM
;
2919 m_encoding
= encoding
;
2922 wxCSConv::~wxCSConv()
2927 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2932 SetName(conv
.m_name
);
2933 m_encoding
= conv
.m_encoding
;
2936 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2940 SetName(conv
.m_name
);
2941 m_encoding
= conv
.m_encoding
;
2946 void wxCSConv::Clear()
2955 void wxCSConv::SetName(const char *charset
)
2959 m_name
= wxStrdup(charset
);
2966 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2967 wxEncodingNameCache
);
2969 static wxEncodingNameCache gs_nameCache
;
// Creates the real conversion object for this wxCSConv.
//
// ISO8859-1 and the default encoding are special-cased and need no converter
// object. Otherwise the candidates are tried in the order documented in the
// body: OS-provided conversion (iconv, Win32 API, CoreFoundation under
// Darwin), then the hard coded UTF converters, then wxMBConv_wxwin as the
// fall back. If the charset cannot be converted an error is logged, with a
// guard against re-entering the logging code.
2972 wxMBConv
*wxCSConv::DoCreate() const
2975 wxLogTrace(TRACE_STRCONV
,
2976 wxT("creating conversion for %s"),
2978 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2979 #endif // wxUSE_FONTMAP
2981 // check for the special case of ASCII or ISO8859-1 charset: as we have
2982 // special knowledge of it anyhow, we don't need to create a special
2983 // conversion object
2984 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2985 m_encoding
== wxFONTENCODING_DEFAULT
)
2987 // don't convert at all
2991 // we trust OS to do conversion better than we can so try external
2992 // conversion methods first
2994 // the full order is:
2995 // 1. OS conversion (iconv() under Unix or Win32 API)
2996 // 2. hard coded conversions for UTF
2997 // 3. wxEncodingConverter as fall back
3003 #endif // !wxUSE_FONTMAP
3006 wxFontEncoding
encoding(m_encoding
);
// iconv: first attempt uses the explicit charset name
3011 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
3019 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3020 #endif // wxUSE_FONTMAP
// look the encoding up in the cache of previously tried charset names
3024 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3025 if ( it
!= gs_nameCache
.end() )
3027 if ( it
->second
.empty() )
3030 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
3037 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3038 // CS : in case this does not return valid names (eg for MacRoman)
3039 // encoding got a 'failure' entry in the cache all the same,
3040 // although it just has to be created using a different method, so
3041 // only store failed iconv creation attempts (or perhaps we
3042 // shouldn't do this at all ?)
3043 if ( names
[0] != NULL
)
3045 for ( ; *names
; ++names
)
3047 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3048 // will need changes that will obsolete this
3049 wxString
name(*names
);
3050 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
3053 gs_nameCache
[encoding
] = *names
;
3060 gs_nameCache
[encoding
] = _T(""); // cache the failure
3063 #endif // wxUSE_FONTMAP
3065 #endif // HAVE_ICONV
3067 #ifdef wxHAVE_WIN32_MB2WC
// Win32 API converter, created from the name if there is one, otherwise
// from the encoding
3070 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3071 : new wxMBConv_win32(m_encoding
);
3080 #endif // wxHAVE_WIN32_MB2WC
3084 // leave UTF16 and UTF32 to the built-ins of wx
3085 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3086 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3089 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3090 : new wxMBConv_cf(m_encoding
);
3092 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3101 #endif // __DARWIN__
3104 wxFontEncoding enc
= m_encoding
;
3106 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3108 // use "false" to suppress interactive dialogs -- we can be called from
3109 // anywhere and popping up a dialog from here is the last thing we want to
3111 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3113 #endif // wxUSE_FONTMAP
// hard coded converters for the UTF encodings
3117 case wxFONTENCODING_UTF7
:
3118 return new wxMBConvUTF7
;
3120 case wxFONTENCODING_UTF8
:
3121 return new wxMBConvUTF8
;
3123 case wxFONTENCODING_UTF16BE
:
3124 return new wxMBConvUTF16BE
;
3126 case wxFONTENCODING_UTF16LE
:
3127 return new wxMBConvUTF16LE
;
3129 case wxFONTENCODING_UTF32BE
:
3130 return new wxMBConvUTF32BE
;
3132 case wxFONTENCODING_UTF32LE
:
3133 return new wxMBConvUTF32LE
;
3136 // nothing to do but put here to suppress gcc warnings
// fall back converter
3143 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3144 : new wxMBConv_wxwin(m_encoding
);
3150 #endif // wxUSE_FONTMAP
3152 // NB: This is a hack to prevent deadlock. What could otherwise happen
3153 // in Unicode build: wxConvLocal creation ends up being here
3154 // because of some failure and logs the error. But wxLog will try to
3155 // attach a timestamp, for which it will need wxConvLocal (to convert
3156 // time to char* and then wchar_t*), but that fails, tries to log the
3157 // error, but wxLog has an (already locked) critical section that
3158 // guards the static buffer.
3159 static bool alreadyLoggingError
= false;
3160 if (!alreadyLoggingError
)
3162 alreadyLoggingError
= true;
3163 wxLogError(_("Cannot convert from the charset '%s'!"),
3167 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
3168 #else // !wxUSE_FONTMAP
3169 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
3170 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3173 alreadyLoggingError
= false;
// Performs the deferred creation of the real converter.
//
// The function is const, so it modifies the object through a non-const
// "self" pointer. If neither a name nor an encoding was given, the encoding
// is first resolved to the system one (or to ISO8859-1 as a fallback); the
// result of DoCreate() is then stored in m_convReal and m_deferred is reset.
3179 void wxCSConv::CreateConvIfNeeded() const
3183 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3185 // if we have neither the name nor the encoding, use the default
3186 // encoding for this system
3187 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3190 self
->m_encoding
= wxLocale::GetSystemEncoding();
3192 // fallback to some reasonable default:
3193 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3194 #endif // wxUSE_INTL
3197 self
->m_convReal
= DoCreate();
3198 self
->m_deferred
= false;
// Tells whether this converter can be used: always true for ISO8859-1, which
// is handled without a real converter object, otherwise true only if the
// real converter could be created.
3202 bool wxCSConv::IsOk() const
3204 CreateConvIfNeeded();
3206 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3207 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3208 return true; // always ok as we do it ourselves
3210 // m_convReal->IsOk() is called at its own creation, so we know it must
3211 // be ok if m_convReal is non-NULL
3212 return m_convReal
!= NULL
;
// Converts the multibyte string src (srcLen bytes, or NUL-terminated if
// srcLen is wxNO_LEN) to wide characters in dst.
//
// The work is forwarded to the real converter; the code after that forwarding
// call converts by itself, widening each byte to one wide character
// (NOTE(review): presumably taken only when m_convReal is NULL, i.e. the
// ISO8859-1 case -- confirm). That path returns wxCONV_FAILED if dstLen is
// smaller than the source length.
3215 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3216 const char *src
, size_t srcLen
) const
3218 CreateConvIfNeeded();
3221 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3224 if ( srcLen
== wxNO_LEN
)
3225 srcLen
= strlen(src
) + 1; // take trailing NUL too
3229 if ( dstLen
< srcLen
)
3230 return wxCONV_FAILED
;
// go through unsigned char so that bytes >= 0x80 are not sign-extended
3232 for ( size_t n
= 0; n
< srcLen
; n
++ )
3233 dst
[n
] = (unsigned char)(src
[n
]);
// Converts the wide string src (srcLen characters, or NUL-terminated if
// srcLen is wxNO_LEN) to multibyte characters in dst.
//
// The work is forwarded to the real converter; the code after that forwarding
// call converts by itself, narrowing each wide character to one byte
// (NOTE(review): presumably taken only when m_convReal is NULL, i.e. the
// ISO8859-1 case -- confirm). On that path any character above 0xFF makes the
// conversion fail with wxCONV_FAILED, and the input is validated the same way
// even in the branch that does not write to dst.
3239 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3240 const wchar_t *src
, size_t srcLen
) const
3242 CreateConvIfNeeded();
3245 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3248 if ( srcLen
== wxNO_LEN
)
3249 srcLen
= wxWcslen(src
) + 1;
3253 if ( dstLen
< srcLen
)
3254 return wxCONV_FAILED
;
3256 for ( size_t n
= 0; n
< srcLen
; n
++ )
3258 if ( src
[n
] > 0xFF )
3259 return wxCONV_FAILED
;
3261 dst
[n
] = (char)src
[n
];
3265 else // still need to check the input validity
3267 for ( size_t n
= 0; n
< srcLen
; n
++ )
3269 if ( src
[n
] > 0xFF )
3270 return wxCONV_FAILED
;
// Returns the length of the terminating NUL in this encoding: the value
// reported by the real converter, with a separate ISO-8859-1 case after it.
3277 size_t wxCSConv::GetMBNulLen() const
3279 CreateConvIfNeeded();
3283 return m_convReal
->GetMBNulLen();
3286 // otherwise, we are ISO-8859-1
3290 #if wxUSE_UNICODE_UTF8
// Tells whether this converter is a UTF-8 one: the answer of the real
// converter, with a separate ISO-8859-1 case after it.
3291 bool wxCSConv::IsUTF8() const
3293 CreateConvIfNeeded();
3297 return m_convReal
->IsUTF8();
3300 // otherwise, we are ISO-8859-1
// Converts a multibyte string to a wide character buffer using several
// converters: wxConvLibc first, then UTF-8, then ISO8859-1.
// NOTE(review): presumably each later converter is tried only if the previous
// conversion failed, and the early empty-buffer return is for a NULL
// argument -- confirm.
3308 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3311 return wxWCharBuffer();
3313 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3315 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3317 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Converts a wide string to a multibyte buffer using wxConvLibc first and
// then a UTF-8 converter created with MAP_INVALID_UTF8_TO_OCTAL.
// NOTE(review): presumably the UTF-8 converter is used only if the libc
// conversion failed, and the early empty-buffer return is for a NULL
// argument -- confirm.
3322 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3325 return wxCharBuffer();
3327 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3329 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3334 #endif // wxUSE_UNICODE
3336 // ----------------------------------------------------------------------------
3338 // ----------------------------------------------------------------------------
3340 // NB: The reason why we create converter objects in this convoluted way,
3341 // using a factory function instead of global variable, is that they
3342 // may be used at static initialization time (some of them are used by
3343 // wxString ctors and there may be a global wxString object). In other
3344 // words, possibly _before_ the converter global object would be
3351 #undef wxConvISO8859_1
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) defines, for the
// global converter "name": the exported pointer name##Ptr (initially NULL),
// the accessor wxGet_##name##Ptr() which returns the address of a
// function-local static impl_klass object constructed with ctor_args, and a
// file-level static initialized by calling that accessor.
// WX_DEFINE_GLOBAL_CONV is the shorthand for the case where the
// implementation class is the same as klass.
3353 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3354 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3355 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3357 static impl_klass name##Obj ctor_args; \
3358 return &name##Obj; \
3360 /* this ensures that all global converter objects are created */ \
3361 /* by the time static initialization is done, i.e. before any */ \
3362 /* thread is launched: */ \
3363 static klass* gs_##name##instance = wxGet_##name##Ptr()
3365 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3366 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// wxConvLibc is defined with one of two implementation classes:
// wxMBConv_win32 or wxMBConvLibc.
// NOTE(review): the preprocessor condition selecting between the two is not
// visible here -- confirm.
3369 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3371 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3374 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3375 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3376 // provokes an error message about "not enough macro parameters"; and we
3377 // can't use "()" here as the name##Obj declaration would be parsed as a
3378 // function declaration then, so use a semicolon and live with an extra
3379 // empty statement (and hope that no compiler warns about this)
// Global converter definitions: the UTF-8 and UTF-7 converters are default
// constructed (the ";" stands in for empty constructor arguments), while
// wxConvLocal and wxConvISO8859_1 are wxCSConv objects created for the
// system encoding and for ISO8859-1 respectively.
3380 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3381 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
3383 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3384 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// wxConvCurrent initially points to the libc converter and wxConvUI to the
// locale (system encoding) converter; both are obtained through the accessor
// functions rather than by referring to the global objects directly.
3386 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3387 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
// Converter used for file names. A dedicated CoreFoundation-based UTF-8
// converter object is defined here; in the !__DARWIN__ branch wxConvFileName
// is simply the libc converter.
// NOTE(review): presumably the Darwin branch points wxConvFileName to
// wxConvMacUTF8DObj -- confirm.
3390 // The xnu kernel always communicates file paths in decomposed UTF-8.
3391 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3392 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
3395 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3398 #else // !__DARWIN__
3399 wxGet_wxConvLibcPtr();
3400 #endif // __DARWIN__/!__DARWIN__
3402 #else // !wxUSE_WCHAR_T
3404 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3405 // stand-ins in absence of wchar_t
3406 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3411 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T