1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/osx/core/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV _T("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
73 // helper function of ToWChar(): check if the n bytes at this location are not all NUL
74 static bool NotAllNULs(const char *p
, size_t n
)
76 while ( n
&& *p
++ == '\0' )
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // moreover, some conversion classes simply can't implement ToWChar()
170 // directly, the primary example is wxConvLibc: mbstowcs() only handles
171 // NUL-terminated strings
173 // the number of chars [which would be] written to dst [if it were not NULL]
174 size_t dstWritten
= 0;
176 // the number of NULs terminating this string
177 size_t nulLen
= 0; // not really needed, but just to avoid warnings
179 // if we were not given the input size we just have to assume that the
180 // string is properly terminated as we have no way of knowing how long it
181 // is anyhow, but if we do have the size check whether there are enough
185 if ( srcLen
!= wxNO_LEN
)
187 // we need to know how to find the end of this string
188 nulLen
= GetMBNulLen();
189 if ( nulLen
== wxCONV_FAILED
)
190 return wxCONV_FAILED
;
192 // if there are enough NULs we can avoid the copy
193 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
195 // make a copy in order to properly NUL-terminate the string
196 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
197 char * const p
= bufTmp
.data();
198 memcpy(p
, src
, srcLen
);
199 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
205 srcEnd
= src
+ srcLen
;
207 else // quit after the first loop iteration
212 // the idea of this code is straightforward: it converts a NUL-terminated
213 // chunk of the string during each iteration and updates the output buffer
216 // all the complications come from the fact that this function, for
217 // historical reasons, must behave in 2 subtly different ways when it's
218 // called with a fixed number of characters and when it's called for the
219 // entire NUL-terminated string: in the former case (srcEnd == NULL) we
220 // must count all characters we convert, NUL or not; but in the latter we
221 // do not count the trailing NUL -- but still count all the NULs inside the
224 // so for the (simple) former case we just always count the trailing NUL,
225 // but for the latter we need to wait until we see if there is going to be
226 // another loop iteration and only count it then
229 // try to convert the current chunk
230 size_t lenChunk
= MB2WC(NULL
, src
, 0);
231 if ( lenChunk
== wxCONV_FAILED
)
232 return wxCONV_FAILED
;
234 dstWritten
+= lenChunk
;
240 // nothing left in the input string, conversion succeeded
246 if ( dstWritten
> dstLen
)
247 return wxCONV_FAILED
;
249 // +1 is for trailing NUL
250 if ( MB2WC(dst
, src
, lenChunk
+ 1) == wxCONV_FAILED
)
251 return wxCONV_FAILED
;
260 // we convert just one chunk in this case as this is the entire
265 // advance the input pointer past the end of this chunk
266 while ( NotAllNULs(src
, nulLen
) )
268 // notice that we must skip over multiple bytes here as we suppose
269 // that if NUL takes 2 or 4 bytes, then all the other characters do
270 // too and so if we advanced by a single byte we might erroneously
271 // detect sequences of NUL bytes in the middle of the input
275 src
+= nulLen
; // skipping over its terminator as well
277 // note that ">=" (and not just "==") is needed here as the terminator
278 // we skipped just above could be inside or just after the buffer
279 // delimited by srcEnd
283 // if we got here then this wasn't the last chunk in this string and
284 // hence we must count an extra char for L'\0' even when converting a
285 // fixed number of characters
298 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
299 const wchar_t *src
, size_t srcLen
) const
301 // the number of chars [which would be] written to dst [if it were not NULL]
302 size_t dstWritten
= 0;
304 // if we don't know its length we have no choice but to assume that it is
305 // NUL-terminated (notice that it can still be NUL-terminated even if
306 // explicit length is given but it doesn't change our return value)
307 const bool isNulTerminated
= srcLen
== wxNO_LEN
;
309 // make a copy of the input string unless it is already properly
311 wxWCharBuffer bufTmp
;
312 if ( isNulTerminated
)
314 srcLen
= wxWcslen(src
) + 1;
316 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
318 // make a copy in order to properly NUL-terminate the string
319 bufTmp
= wxWCharBuffer(srcLen
);
320 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
324 const size_t lenNul
= GetMBNulLen();
325 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
327 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
329 // try to convert the current chunk
330 size_t lenChunk
= WC2MB(NULL
, src
, 0);
332 if ( lenChunk
== wxCONV_FAILED
)
333 return wxCONV_FAILED
;
335 dstWritten
+= lenChunk
;
336 if ( isNulTerminated
)
337 dstWritten
+= lenNul
;
341 if ( dstWritten
> dstLen
)
342 return wxCONV_FAILED
;
344 if ( WC2MB(dst
, src
, lenChunk
+ lenNul
) == wxCONV_FAILED
)
345 return wxCONV_FAILED
;
348 if ( isNulTerminated
)
356 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
358 // add 1 to available buffer length because MB2WC() parameter counts the
359 // number of non-NUL characters while ToWChar() counts everything
360 size_t rc
= ToWChar(outBuff
, outLen
+ 1, inBuff
);
361 if ( rc
!= wxCONV_FAILED
)
363 // ToWChar() returns the buffer length, i.e. including the trailing
364 // NUL, while this method doesn't take it into account
371 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
373 const size_t nulLen
= GetMBNulLen();
375 size_t rc
= FromWChar(outBuff
, outLen
+ nulLen
, inBuff
);
376 if ( rc
!= wxCONV_FAILED
)
384 wxMBConv::~wxMBConv()
386 // nothing to do here (necessary for Darwin linking probably)
389 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
393 // calculate the length of the buffer needed first
394 const size_t nLen
= ToWChar(NULL
, 0, psz
);
395 if ( nLen
!= wxCONV_FAILED
)
397 // now do the actual conversion
398 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
400 // +1 for the trailing NUL
401 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
406 return wxWCharBuffer();
409 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
413 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
414 if ( nLen
!= wxCONV_FAILED
)
416 wxCharBuffer
buf(nLen
- 1);
417 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
422 return wxCharBuffer();
426 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
428 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
429 if ( dstLen
!= wxCONV_FAILED
)
431 // notice that we allocate space for dstLen+1 wide characters here
432 // because we want the buffer to always be NUL-terminated, even if the
433 // input isn't (as otherwise the caller has no way to know its length)
434 wxWCharBuffer
wbuf(dstLen
);
435 wbuf
.data()[dstLen
] = L
'\0';
436 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
442 // we also need to handle NUL-terminated input strings
443 // specially: for them the output is the length of the string
444 // excluding the trailing NUL, however if we're asked to
445 // convert a specific number of characters we return the length
446 // of the resulting output even if it's NUL-terminated
447 if ( inLen
== wxNO_LEN
)
458 return wxWCharBuffer();
462 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
464 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
465 if ( dstLen
!= wxCONV_FAILED
)
467 const size_t nulLen
= GetMBNulLen();
469 // as above, ensure that the buffer is always NUL-terminated, even if
471 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
472 memset(buf
.data() + dstLen
, 0, nulLen
);
473 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
479 if ( inLen
== wxNO_LEN
)
481 // in this case both input and output are NUL-terminated
482 // and we're not supposed to count NUL
494 return wxCharBuffer();
497 // ----------------------------------------------------------------------------
499 // ----------------------------------------------------------------------------
501 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
503 return wxMB2WC(buf
, psz
, n
);
506 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
508 return wxWC2MB(buf
, psz
, n
);
511 // ----------------------------------------------------------------------------
512 // wxConvBrokenFileNames
513 // ----------------------------------------------------------------------------
517 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
519 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
520 wxStricmp(charset
, _T("UTF8")) == 0 )
521 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
523 m_conv
= new wxCSConv(charset
);
528 // ----------------------------------------------------------------------------
530 // ----------------------------------------------------------------------------
532 // Implementation (C) 2004 Fredrik Roubert
534 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
537 // BASE64 decoding table
539 static const unsigned char utf7unb64
[] =
541 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
542 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
543 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
544 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
545 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
546 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
547 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
548 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
549 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
550 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
551 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
552 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
553 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
554 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
555 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
556 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
557 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
558 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
559 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
560 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
561 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
562 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
563 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
564 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
565 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
566 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
567 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
568 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
569 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
570 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
571 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
572 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
575 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
576 const char *src
, size_t srcLen
) const
578 DecoderState stateOrig
,
580 if ( srcLen
== wxNO_LEN
)
582 // convert the entire string, up to and including the trailing NUL
583 srcLen
= strlen(src
) + 1;
585 // when working on the entire strings we don't update nor use the shift
586 // state from the previous call
587 statePtr
= &stateOrig
;
589 else // when working with partial strings we do use the shift state
591 statePtr
= wx_const_cast(DecoderState
*, &m_stateDecoder
);
593 // also save the old state to be able to rollback to it on error
594 stateOrig
= m_stateDecoder
;
597 // but to simplify the code below we use this variable in both cases
598 DecoderState
& state
= *statePtr
;
601 // number of characters [which would have been] written to dst [if it were
605 const char * const srcEnd
= src
+ srcLen
;
607 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
609 const unsigned char cc
= *src
++;
611 if ( state
.IsShifted() )
613 const unsigned char dc
= utf7unb64
[cc
];
616 // end of encoded part, check that nothing was left: there can
617 // be up to 4 bits of 0 padding but nothing else (we also need
618 // to check isLSB as we count bits modulo 8 while a valid UTF-7
619 // encoded sequence must contain an integral number of UTF-16
621 if ( state
.isLSB
|| state
.bit
> 4 ||
622 (state
.accum
& ((1 << state
.bit
) - 1)) )
627 return wxCONV_FAILED
;
632 // re-parse this character normally below unless it's '-' which
633 // is consumed by the decoder
637 else // valid encoded character
639 // mini base64 decoder: each character is 6 bits
644 if ( state
.bit
>= 8 )
646 // got the full byte, consume it
648 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
652 // we've got the full word, output it
654 *dst
++ = (state
.msb
<< 8) | b
;
660 // just store it while we wait for LSB
668 if ( state
.IsDirect() )
670 // start of an encoded segment?
675 // just the encoded plus sign, don't switch to shifted mode
681 else if ( utf7unb64
[(unsigned)*src
] == 0xff )
683 // empty encoded chunks are not allowed
687 return wxCONV_FAILED
;
689 else // base-64 encoded chunk follows
696 // only printable 7 bit ASCII characters (with the exception of
697 // NUL, TAB, CR and LF) can be used directly
698 if ( cc
>= 0x7f || (cc
< ' ' &&
699 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
700 return wxCONV_FAILED
;
711 // as we didn't read any characters we should be called with the same
712 // data (followed by some more new data) again later so don't save our
716 return wxCONV_FAILED
;
723 // BASE64 encoding table
725 static const unsigned char utf7enb64
[] =
727 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
728 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
729 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
730 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
731 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
732 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
733 'w', 'x', 'y', 'z', '0', '1', '2', '3',
734 '4', '5', '6', '7', '8', '9', '+', '/'
738 // UTF-7 encoding table
740 // 0 - Set D (directly encoded characters)
741 // 1 - Set O (optional direct characters)
742 // 2 - whitespace characters (optional)
743 // 3 - special characters
745 static const unsigned char utf7encode
[128] =
747 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
748 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
749 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
750 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
751 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
752 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
753 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
754 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
757 static inline bool wxIsUTF7Direct(wchar_t wc
)
759 return wc
< 0x80 && utf7encode
[wc
] < 1;
762 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
763 const wchar_t *src
, size_t srcLen
) const
765 EncoderState stateOrig
,
767 if ( srcLen
== wxNO_LEN
)
769 // we don't apply the stored state when operating on entire strings at
771 statePtr
= &stateOrig
;
773 srcLen
= wxWcslen(src
) + 1;
775 else // do use the mode we left the output in previously
777 stateOrig
= m_stateEncoder
;
778 statePtr
= wx_const_cast(EncoderState
*, &m_stateEncoder
);
781 EncoderState
& state
= *statePtr
;
786 const wchar_t * const srcEnd
= src
+ srcLen
;
787 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
790 if ( wxIsUTF7Direct(cc
) )
792 if ( state
.IsShifted() )
794 // pad with zeros the last encoded block if necessary
798 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
813 else if ( cc
== '+' && state
.IsDirect() )
824 else if (((wxUint32
)cc
) > 0xffff)
826 // no surrogate pair generation (yet?)
827 return wxCONV_FAILED
;
832 if ( state
.IsDirect() )
841 // BASE64 encode string
844 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
847 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
849 for (state
.bit
+= 8; state
.bit
>= 6; )
853 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
858 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
866 // we need to restore the original encoder state if we were called just to
867 // calculate the amount of space needed as we will presumably be called
868 // again to really convert the data now
875 // ----------------------------------------------------------------------------
877 // ----------------------------------------------------------------------------
879 static const wxUint32 utf8_max
[]=
880 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
882 // boundaries of the private use area we use to (temporarily) remap invalid
883 // characters in a UTF-8 encoded string
884 const wxUint32 wxUnicodePUA
= 0x100000;
885 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
887 // this table gives the length of the UTF-8 encoding from its first character:
888 const unsigned char tableUtf8Lengths
[256] = {
889 // single-byte sequences (ASCII):
890 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
891 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
892 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
893 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
894 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
895 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
896 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
897 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
899 // these are invalid:
900 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
901 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
902 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
903 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
906 // two-byte sequences:
907 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
908 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
910 // three-byte sequences:
911 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
913 // four-byte sequences:
914 4, 4, 4, 4, 4, // F0..F4
916 // these are invalid again (5- or 6-byte
917 // sequences and sequences for code points
918 // above U+10FFFF, as restricted by RFC 3629):
919 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
923 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
924 const char *src
, size_t srcLen
) const
926 wchar_t *out
= dstLen
? dst
: NULL
;
929 if ( srcLen
== wxNO_LEN
)
930 srcLen
= strlen(src
) + 1;
932 for ( const char *p
= src
; ; p
++ )
934 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
936 // all done successfully, just add the trailing NUL if we are not
937 // using explicit length
938 if ( srcLen
== wxNO_LEN
)
954 if ( out
&& !dstLen
-- )
958 unsigned char c
= *p
;
962 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
965 if ( srcLen
!= wxNO_LEN
)
972 unsigned len
= tableUtf8Lengths
[c
];
976 if ( srcLen
< len
) // the test works for wxNO_LEN too
979 if ( srcLen
!= wxNO_LEN
)
982 // Char. number range | UTF-8 octet sequence
983 // (hexadecimal) | (binary)
984 // ----------------------+----------------------------------------
985 // 0000 0000 - 0000 007F | 0xxxxxxx
986 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
987 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
988 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
990 // Code point value is stored in bits marked with 'x',
991 // lowest-order bit of the value on the right side in the diagram
992 // above. (from RFC 3629)
994 // mask to extract lead byte's value ('x' bits above), by sequence
996 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
998 // mask and value of lead byte's most significant bits, by length:
999 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1000 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1002 len
--; // it's more convenient to work with 0-based length here
1004 // extract the lead byte's value bits:
1005 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
1008 code
= c
& leadValueMask
[len
];
1010 // all remaining bytes, if any, are handled in the same way
1011 // regardless of sequence's length:
1012 for ( ; len
; --len
)
1015 if ( (c
& 0xC0) != 0x80 )
1016 return wxCONV_FAILED
;
1024 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1025 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
1034 #endif // WC_UTF16/!WC_UTF16
1042 return wxCONV_FAILED
;
1046 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
1047 const wchar_t *src
, size_t srcLen
) const
1049 char *out
= dstLen
? dst
: NULL
;
1052 for ( const wchar_t *wp
= src
; ; wp
++ )
1054 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
) )
1056 // all done successfully, just add the trailing NUL if we are not
1057 // using explicit length
1058 if ( srcLen
== wxNO_LEN
)
1074 if ( srcLen
!= wxNO_LEN
)
1079 // cast is ok for WC_UTF16
1080 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1082 // skip the next char too as we decoded a surrogate
1085 #else // wchar_t is UTF-32
1086 code
= *wp
& 0x7fffffff;
1098 out
[0] = (char)code
;
1101 else if ( code
<= 0x07FF )
1109 // NB: this line takes 6 least significant bits, encodes them as
1110 // 10xxxxxx and discards them so that the next byte can be encoded:
1111 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1112 out
[0] = 0xC0 | code
;
1115 else if ( code
< 0xFFFF )
1123 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1124 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1125 out
[0] = 0xE0 | code
;
1128 else if ( code
<= 0x10FFFF )
1136 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1137 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1138 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1139 out
[0] = 0xF0 | code
;
1144 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1157 // we only get here if an error occurs during decoding
1158 return wxCONV_FAILED
;
1161 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1162 const char *psz
, size_t srcLen
) const
1164 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1165 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1169 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1171 const char *opsz
= psz
;
1172 bool invalid
= false;
1173 unsigned char cc
= *psz
++, fc
= cc
;
1175 for (cnt
= 0; fc
& 0x80; cnt
++)
1185 // escape the escape character for octal escapes
1186 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1187 && cc
== '\\' && (!buf
|| len
< n
))
1199 // invalid UTF-8 sequence
1204 unsigned ocnt
= cnt
- 1;
1205 wxUint32 res
= cc
& (0x3f >> cnt
);
1209 if ((cc
& 0xC0) != 0x80)
1211 // invalid UTF-8 sequence
1217 res
= (res
<< 6) | (cc
& 0x3f);
1220 if (invalid
|| res
<= utf8_max
[ocnt
])
1222 // illegal UTF-8 encoding
1225 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1226 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1228 // if one of our PUA characters turns up externally
1229 // it must also be treated as an illegal sequence
1230 // (a bit like you have to escape an escape character)
1236 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1237 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1238 if (pa
== wxCONV_FAILED
)
1250 *buf
++ = (wchar_t)res
;
1252 #endif // WC_UTF16/!WC_UTF16
1258 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1260 while (opsz
< psz
&& (!buf
|| len
< n
))
1263 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1264 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1265 wxASSERT(pa
!= wxCONV_FAILED
);
1272 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1278 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1280 while (opsz
< psz
&& (!buf
|| len
< n
))
1282 if ( buf
&& len
+ 3 < n
)
1284 unsigned char on
= *opsz
;
1286 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1287 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1288 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1295 else // MAP_INVALID_UTF8_NOT
1297 return wxCONV_FAILED
;
1303 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1309 static inline bool isoctal(wchar_t wch
)
1311 return L
'0' <= wch
&& wch
<= L
'7';
1314 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1315 const wchar_t *psz
, size_t srcLen
) const
1317 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1318 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1322 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1327 // cast is ok for WC_UTF16
1328 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1329 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1331 cc
= (*psz
++) & 0x7fffffff;
1334 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1335 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1338 *buf
++ = (char)(cc
- wxUnicodePUA
);
1341 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1342 && cc
== L
'\\' && psz
[0] == L
'\\' )
1349 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1351 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1355 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1356 (psz
[1] - L
'0') * 010 +
1366 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1382 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1384 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1390 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1396 // ============================================================================
1398 // ============================================================================
1400 #ifdef WORDS_BIGENDIAN
1401 #define wxMBConvUTF16straight wxMBConvUTF16BE
1402 #define wxMBConvUTF16swap wxMBConvUTF16LE
1404 #define wxMBConvUTF16swap wxMBConvUTF16BE
1405 #define wxMBConvUTF16straight wxMBConvUTF16LE
1409 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1411 if ( srcLen
== wxNO_LEN
)
1413 // count the number of bytes in input, including the trailing NULs
1414 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1415 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1418 srcLen
*= BYTES_PER_CHAR
;
1420 else // we already have the length
1422 // we can only convert a whole number of UTF-16 characters
1423 if ( srcLen
% BYTES_PER_CHAR
)
1424 return wxCONV_FAILED
;
1430 // case when in-memory representation is UTF-16 too
1433 // ----------------------------------------------------------------------------
1434 // conversions without endianness change
1435 // ----------------------------------------------------------------------------
1438 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1439 const char *src
, size_t srcLen
) const
1441 // set up the scene for using memcpy() (which is presumably more efficient
1442 // than copying the bytes one by one)
1443 srcLen
= GetLength(src
, srcLen
);
1444 if ( srcLen
== wxNO_LEN
)
1445 return wxCONV_FAILED
;
1447 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1450 if ( dstLen
< inLen
)
1451 return wxCONV_FAILED
;
1453 memcpy(dst
, src
, srcLen
);
1460 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1461 const wchar_t *src
, size_t srcLen
) const
1463 if ( srcLen
== wxNO_LEN
)
1464 srcLen
= wxWcslen(src
) + 1;
1466 srcLen
*= BYTES_PER_CHAR
;
1470 if ( dstLen
< srcLen
)
1471 return wxCONV_FAILED
;
1473 memcpy(dst
, src
, srcLen
);
1479 // ----------------------------------------------------------------------------
1480 // endian-reversing conversions
1481 // ----------------------------------------------------------------------------
1484 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1485 const char *src
, size_t srcLen
) const
1487 srcLen
= GetLength(src
, srcLen
);
1488 if ( srcLen
== wxNO_LEN
)
1489 return wxCONV_FAILED
;
1491 srcLen
/= BYTES_PER_CHAR
;
1495 if ( dstLen
< srcLen
)
1496 return wxCONV_FAILED
;
1498 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1499 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1501 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1509 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1510 const wchar_t *src
, size_t srcLen
) const
1512 if ( srcLen
== wxNO_LEN
)
1513 srcLen
= wxWcslen(src
) + 1;
1515 srcLen
*= BYTES_PER_CHAR
;
1519 if ( dstLen
< srcLen
)
1520 return wxCONV_FAILED
;
1522 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1523 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1525 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1532 #else // !WC_UTF16: wchar_t is UTF-32
1534 // ----------------------------------------------------------------------------
1535 // conversions without endianness change
1536 // ----------------------------------------------------------------------------
// Decode native-endian UTF-16 into UTF-32 wchar_t. Input units are consumed
// through wxDecodeSurrogate(), which advances inBuff itself, so the loop has
// no increment expression. Fails with wxCONV_FAILED when GetLength() rejects
// the input or when more than dstLen characters would be produced.
1539 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1540 const char *src
, size_t srcLen
) const
1542 srcLen
= GetLength(src
, srcLen
);
1543 if ( srcLen
== wxNO_LEN
)
1544 return wxCONV_FAILED
;
// number of 16-bit units in the input
1546 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1549 // optimization: return maximal space which could be needed for this
1550 // string even if the real size could be smaller if the buffer contains
1556 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1557 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1559 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1561 return wxCONV_FAILED
;
// outLen counts the characters produced so far
1563 if ( ++outLen
> dstLen
)
1564 return wxCONV_FAILED
;
// Encode UTF-32 wchar_t as native-endian UTF-16 using encode_utf16(), which
// yields one or two 16-bit units per character. With srcLen == wxNO_LEN the
// terminating NUL is included in the conversion. Fails with wxCONV_FAILED
// when a character cannot be encoded or when the output would exceed dstLen
// bytes.
1574 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1575 const wchar_t *src
, size_t srcLen
) const
1577 if ( srcLen
== wxNO_LEN
)
1578 srcLen
= wxWcslen(src
) + 1;
1581 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1582 for ( size_t n
= 0; n
< srcLen
; n
++ )
1585 const size_t numChars
= encode_utf16(*src
++, cc
);
1586 if ( numChars
== wxCONV_FAILED
)
1587 return wxCONV_FAILED
;
// outLen is accumulated in bytes
1589 outLen
+= numChars
* BYTES_PER_CHAR
;
1592 if ( outLen
> dstLen
)
1593 return wxCONV_FAILED
;
1596 if ( numChars
== 2 )
1598 // second character of a surrogate
1607 // ----------------------------------------------------------------------------
1608 // endian-reversing conversions
1609 // ----------------------------------------------------------------------------
// Decode UTF-16 of the opposite byte order into UTF-32 wchar_t. Up to two
// input units are byte-swapped into a small temporary array and handed to
// decode_utf16(). Fails with wxCONV_FAILED when GetLength() rejects the
// input, when decode_utf16() fails, or when more than dstLen characters would
// be produced.
1612 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1613 const char *src
, size_t srcLen
) const
1615 srcLen
= GetLength(src
, srcLen
);
1616 if ( srcLen
== wxNO_LEN
)
1617 return wxCONV_FAILED
;
// number of 16-bit units in the input
1619 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1622 // optimization: return maximal space which could be needed for this
1623 // string even if the real size could be smaller if the buffer contains
1629 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1630 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1635 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1637 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1639 const size_t numChars
= decode_utf16(tmp
, ch
);
1640 if ( numChars
== wxCONV_FAILED
)
1641 return wxCONV_FAILED
;
1643 if ( numChars
== 2 )
1646 if ( ++outLen
> dstLen
)
1647 return wxCONV_FAILED
;
// Encode UTF-32 wchar_t as UTF-16 of the opposite byte order: each character
// goes through encode_utf16() and the resulting one or two units are
// byte-swapped on output. With srcLen == wxNO_LEN the terminating NUL is
// included. Fails with wxCONV_FAILED when a character cannot be encoded or
// when the output would exceed dstLen bytes.
1657 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1658 const wchar_t *src
, size_t srcLen
) const
1660 if ( srcLen
== wxNO_LEN
)
1661 srcLen
= wxWcslen(src
) + 1;
1664 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1665 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1668 const size_t numChars
= encode_utf16(*src
, cc
);
1669 if ( numChars
== wxCONV_FAILED
)
1670 return wxCONV_FAILED
;
// outLen is accumulated in bytes
1672 outLen
+= numChars
* BYTES_PER_CHAR
;
1675 if ( outLen
> dstLen
)
1676 return wxCONV_FAILED
;
1678 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1679 if ( numChars
== 2 )
1681 // second character of a surrogate
1682 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1690 #endif // WC_UTF16/!WC_UTF16
1693 // ============================================================================
1695 // ============================================================================
1697 #ifdef WORDS_BIGENDIAN
1698 #define wxMBConvUTF32straight wxMBConvUTF32BE
1699 #define wxMBConvUTF32swap wxMBConvUTF32LE
1701 #define wxMBConvUTF32swap wxMBConvUTF32BE
1702 #define wxMBConvUTF32straight wxMBConvUTF32LE
1706 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1707 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
// Determine the length in bytes of a UTF-32 input buffer.
//
// With srcLen == wxNO_LEN the buffer is scanned 32 bits at a time up to and
// including the first zero unit. Otherwise the caller's length is validated:
// it must be a multiple of BYTES_PER_CHAR or wxCONV_FAILED is returned.
//
// NOTE(review): the callers in this file compare the result against wxNO_LEN
// while the failure value returned here is wxCONV_FAILED -- presumably the two
// constants have the same value; confirm.
1710 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1712 if ( srcLen
== wxNO_LEN
)
1714 // count the number of bytes in input, including the trailing NULs
1715 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
// srcLen starts at 1 so that the terminating zero unit is counted as well
1716 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1719 srcLen
*= BYTES_PER_CHAR
;
1721 else // we already have the length
1723 // we can only convert an entire number of UTF-32 characters
1724 if ( srcLen
% BYTES_PER_CHAR
)
1725 return wxCONV_FAILED
;
1731 // case when in-memory representation is UTF-16
1734 // ----------------------------------------------------------------------------
1735 // conversions without endianness change
1736 // ----------------------------------------------------------------------------
// Decode native-endian UTF-32 into wchar_t holding UTF-16: every 32-bit input
// unit is passed to encode_utf16(), which may yield a surrogate pair. Fails
// with wxCONV_FAILED when GetLength() rejects the input, when a code point
// cannot be encoded, or when the output would exceed dstLen.
1739 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1740 const char *src
, size_t srcLen
) const
1742 srcLen
= GetLength(src
, srcLen
);
1743 if ( srcLen
== wxNO_LEN
)
1744 return wxCONV_FAILED
;
1746 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
// number of 32-bit units in the input
1747 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1749 for ( size_t n
= 0; n
< inLen
; n
++ )
1752 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1753 if ( numChars
== wxCONV_FAILED
)
1754 return wxCONV_FAILED
;
1759 if ( outLen
> dstLen
)
1760 return wxCONV_FAILED
;
1763 if ( numChars
== 2 )
1765 // second character of a surrogate
// Encode wchar_t holding UTF-16 as native-endian UTF-32. Characters are read
// through wxDecodeSurrogate(), which advances src itself. With
// srcLen == wxNO_LEN the terminating NUL is included. One early-return path
// reports srcLen * BYTES_PER_CHAR as an upper bound on the space needed (see
// the comment below). Fails with wxCONV_FAILED when the output would exceed
// dstLen bytes.
1775 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1776 const wchar_t *src
, size_t srcLen
) const
1778 if ( srcLen
== wxNO_LEN
)
1779 srcLen
= wxWcslen(src
) + 1;
1783 // optimization: return maximal space which could be needed for this
1784 // string instead of the exact amount which could be less if there are
1785 // any surrogates in the input
1787 // we consider that surrogates are rare enough to make it worthwhile to
1788 // avoid running the loop below at the cost of slightly extra memory
1790 return srcLen
* BYTES_PER_CHAR
;
1793 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1795 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1797 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1799 return wxCONV_FAILED
;
// outLen is accumulated in bytes
1801 outLen
+= BYTES_PER_CHAR
;
1803 if ( outLen
> dstLen
)
1804 return wxCONV_FAILED
;
1812 // ----------------------------------------------------------------------------
1813 // endian-reversing conversions
1814 // ----------------------------------------------------------------------------
// Decode UTF-32 of the opposite byte order into wchar_t holding UTF-16: each
// 32-bit unit is byte-swapped and then passed to encode_utf16(). Fails with
// wxCONV_FAILED when GetLength() rejects the input, when a code point cannot
// be encoded, or when the output would exceed dstLen.
1817 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1818 const char *src
, size_t srcLen
) const
1820 srcLen
= GetLength(src
, srcLen
);
1821 if ( srcLen
== wxNO_LEN
)
1822 return wxCONV_FAILED
;
1824 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
// number of 32-bit units in the input
1825 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1827 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1830 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1831 if ( numChars
== wxCONV_FAILED
)
1832 return wxCONV_FAILED
;
1837 if ( outLen
> dstLen
)
1838 return wxCONV_FAILED
;
1841 if ( numChars
== 2 )
1843 // second character of a surrogate
// Encode wchar_t holding UTF-16 as UTF-32 of the opposite byte order:
// characters are read through wxDecodeSurrogate() (which advances src) and
// written byte-swapped. With srcLen == wxNO_LEN the terminating NUL is
// included. One early-return path reports srcLen * BYTES_PER_CHAR as an upper
// bound on the space needed. Fails with wxCONV_FAILED when the output would
// exceed dstLen bytes.
1853 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1854 const wchar_t *src
, size_t srcLen
) const
1856 if ( srcLen
== wxNO_LEN
)
1857 srcLen
= wxWcslen(src
) + 1;
1861 // optimization: return maximal space which could be needed for this
1862 // string instead of the exact amount which could be less if there are
1863 // any surrogates in the input
1865 // we consider that surrogates are rare enough to make it worthwhile to
1866 // avoid running the loop below at the cost of slightly extra memory
1868 return srcLen
*BYTES_PER_CHAR
;
1871 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1873 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1875 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1877 return wxCONV_FAILED
;
// outLen is accumulated in bytes
1879 outLen
+= BYTES_PER_CHAR
;
1881 if ( outLen
> dstLen
)
1882 return wxCONV_FAILED
;
1884 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1890 #else // !WC_UTF16: wchar_t is UTF-32
1892 // ----------------------------------------------------------------------------
1893 // conversions without endianness change
1894 // ----------------------------------------------------------------------------
// Decode native-endian UTF-32 into UTF-32 wchar_t: the representations are
// identical, so the bytes are copied with memcpy(). Fails with wxCONV_FAILED
// when GetLength() rejects the input or when dstLen is smaller than the number
// of characters to copy.
1897 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1898 const char *src
, size_t srcLen
) const
1900 // use memcpy() as it should be much faster than hand-written loop
1901 srcLen
= GetLength(src
, srcLen
);
1902 if ( srcLen
== wxNO_LEN
)
1903 return wxCONV_FAILED
;
// srcLen stays in bytes (needed by memcpy), inLen is in characters
1905 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1908 if ( dstLen
< inLen
)
1909 return wxCONV_FAILED
;
1911 memcpy(dst
, src
, srcLen
);
// Encode UTF-32 wchar_t as native-endian UTF-32 with a plain memcpy(). With
// srcLen == wxNO_LEN the terminating NUL is included. Fails with
// wxCONV_FAILED when dstLen (in bytes) is too small.
1918 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1919 const wchar_t *src
, size_t srcLen
) const
1921 if ( srcLen
== wxNO_LEN
)
1922 srcLen
= wxWcslen(src
) + 1;
// from here on srcLen is a byte count
1924 srcLen
*= BYTES_PER_CHAR
;
1928 if ( dstLen
< srcLen
)
1929 return wxCONV_FAILED
;
1931 memcpy(dst
, src
, srcLen
);
1937 // ----------------------------------------------------------------------------
1938 // endian-reversing conversions
1939 // ----------------------------------------------------------------------------
// Decode UTF-32 of the opposite byte order into UTF-32 wchar_t by
// byte-swapping every 32-bit unit. Fails with wxCONV_FAILED when GetLength()
// rejects the input or when dstLen is smaller than the number of units.
1942 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1943 const char *src
, size_t srcLen
) const
1945 srcLen
= GetLength(src
, srcLen
);
1946 if ( srcLen
== wxNO_LEN
)
1947 return wxCONV_FAILED
;
// from here on srcLen counts 32-bit units instead of bytes
1949 srcLen
/= BYTES_PER_CHAR
;
1953 if ( dstLen
< srcLen
)
1954 return wxCONV_FAILED
;
1956 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1957 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1959 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
// Encode UTF-32 wchar_t as UTF-32 of the opposite byte order by byte-swapping
// every character. With srcLen == wxNO_LEN the terminating NUL is included.
// Fails with wxCONV_FAILED when dstLen (in bytes) is too small.
1967 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1968 const wchar_t *src
, size_t srcLen
) const
1970 if ( srcLen
== wxNO_LEN
)
1971 srcLen
= wxWcslen(src
) + 1;
// from here on srcLen is the number of output bytes
1973 srcLen
*= BYTES_PER_CHAR
;
1977 if ( dstLen
< srcLen
)
1978 return wxCONV_FAILED
;
1980 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
// n advances in bytes while src advances by one wide character
1981 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1983 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1990 #endif // WC_UTF16/!WC_UTF16
1993 // ============================================================================
1994 // The classes doing conversion using the iconv_xxx() functions
1995 // ============================================================================
1999 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2000 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
2001 // (unless there's yet another bug in glibc) the only case when iconv()
2002 // returns with (size_t)-1 (which means error) and says there are 0 bytes
2003 // left in the input buffer -- when _real_ error occurs,
2004 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2006 // [This bug does not appear in glibc 2.2.]
2007 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
2008 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
2009 (errno != E2BIG || bufLeft != 0))
2011 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
2014 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
2016 #define ICONV_T_INVALID ((iconv_t)-1)
2018 #if SIZEOF_WCHAR_T == 4
2019 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
2020 #define WC_ENC wxFONTENCODING_UTF32
2021 #elif SIZEOF_WCHAR_T == 2
2022 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
2023 #define WC_ENC wxFONTENCODING_UTF16
2024 #else // sizeof(wchar_t) != 2 nor 4
2025 // does this ever happen?
2026 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2029 // ----------------------------------------------------------------------------
2030 // wxMBConv_iconv: encapsulates an iconv character set
2031 // ----------------------------------------------------------------------------
// wxMBConv implementation for one named charset, built on a pair of iconv
// handles: m2w (multibyte -> wide) and w2m (wide -> multibyte). The wide
// char charset name and whether its byte order needs swapping are shared by
// all instances through the static members below.
2033 class wxMBConv_iconv
: public wxMBConv
2036 wxMBConv_iconv(const char *name
);
2037 virtual ~wxMBConv_iconv();
2039 // implement base class virtual methods
2040 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
2041 const char *src
, size_t srcLen
= wxNO_LEN
) const;
2042 virtual size_t FromWChar(char *dst
, size_t dstLen
,
2043 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
2044 virtual size_t GetMBNulLen() const;
2046 #if wxUSE_UNICODE_UTF8
2047 virtual bool IsUTF8() const;
// the copy is created from the charset name and inherits the cached NUL
// length so that it doesn't have to be recomputed
2050 virtual wxMBConv
*Clone() const
2052 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
2053 p
->m_minMBCharWidth
= m_minMBCharWidth
;
// the object is usable only if both conversion directions could be opened
2058 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
2061 // the iconv handlers used to translate from multibyte
2062 // to wide char and in the other direction
2067 // guards access to m2w and w2m objects
2068 wxMutex m_iconvMutex
;
2072 // the name (for iconv_open()) of a wide char charset -- if none is
2073 // available on this machine, it will remain empty
2074 static wxString ms_wcCharsetName
;
2076 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2077 // different endian-ness than the native one
2078 static bool ms_wcNeedsSwap
;
2081 // name of the encoding handled by this conversion
2084 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2086 size_t m_minMBCharWidth
;
2089 // make the constructor available for unit testing
// (creates a wxMBConv_iconv for the given charset name and then checks IsOk()
// on it; the handling of a failed check is not visible in this excerpt)
2090 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2092 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2093 if ( !result
->IsOk() )
2102 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2103 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
// Open the iconv handles for converting between the charset `name` and
// wchar_t.
//
// The first object constructed also discovers which iconv charset name
// represents wchar_t on this system (cached in ms_wcCharsetName): for each
// candidate it first tries the name with an explicit byte order suffix and, if
// that cannot be opened, the bare name, in which case a test conversion is
// used to find out whether the result needs byte swapping (ms_wcNeedsSwap).
// Later objects reuse the cached name. If no wide char charset was found, w2m
// is left as ICONV_T_INVALID.
2105 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
// 0 means that the NUL length hasn't been computed yet
2108 m_minMBCharWidth
= 0;
2110 // check for charset that represents wchar_t:
2111 if ( ms_wcCharsetName
.empty() )
2113 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
2116 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2117 #else // !wxUSE_FONTMAP
2118 static const wxChar
*names_static
[] =
2120 #if SIZEOF_WCHAR_T == 4
2122 #elif SIZEOF_WCHAR_T == 2
2127 const wxChar
**names
= names_static
;
2128 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
// stop at the first candidate name which works
2130 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2132 const wxString
nameCS(*names
);
2134 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2135 wxString
nameXE(nameCS
);
2137 #ifdef WORDS_BIGENDIAN
2139 #else // little endian
2143 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2146 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2147 if ( m2w
== ICONV_T_INVALID
)
2149 // try charset w/o bytesex info (e.g. "UCS4")
2150 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2152 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2154 // and check for bytesex ourselves:
2155 if ( m2w
!= ICONV_T_INVALID
)
2157 char buf
[2], *bufPtr
;
2166 outsz
= SIZEOF_WCHAR_T
* 2;
2167 char* wbufPtr
= (char*)wbuf
;
2171 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2174 if (ICONV_FAILED(res
, insz
))
2176 wxLogLastError(wxT("iconv"));
2177 wxLogError(_("Conversion to charset '%s' doesn't work."),
2180 else // ok, can convert to this encoding, remember it
2182 ms_wcCharsetName
= nameCS
;
// if the converted character doesn't compare equal to the original one, the
// charset uses the other byte order
2183 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2187 else // use charset not requiring byte swapping
2189 ms_wcCharsetName
= nameXE
;
2193 wxLogTrace(TRACE_STRCONV
,
2194 wxT("iconv wchar_t charset is \"%s\"%s"),
2195 ms_wcCharsetName
.empty() ? wxString("<none>")
2197 ms_wcNeedsSwap
? _T(" (needs swap)")
2200 else // we already have ms_wcCharsetName
2202 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
// without a wide char charset the reverse conversion can't be opened at all
2205 if ( ms_wcCharsetName
.empty() )
2207 w2m
= ICONV_T_INVALID
;
2211 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2212 if ( w2m
== ICONV_T_INVALID
)
2214 wxLogTrace(TRACE_STRCONV
,
2215 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2216 ms_wcCharsetName
.c_str(), name
);
// Each of the two iconv handles is only touched if it was successfully opened,
// i.e. differs from ICONV_T_INVALID (presumably to close it -- the guarded
// statements are not visible in this excerpt).
2221 wxMBConv_iconv::~wxMBConv_iconv()
2223 if ( m2w
!= ICONV_T_INVALID
)
2225 if ( w2m
!= ICONV_T_INVALID
)
// Convert multibyte text in this object's charset to wchar_t using iconv.
//
// With srcLen == wxNO_LEN the input length is determined here, taking into
// account that the terminator may be 1, 2 or 4 NUL bytes wide depending on
// the charset (see GetMBNulLen()). When a destination buffer is given the
// conversion is done directly into it, otherwise the input is converted
// piecewise into a small temporary buffer just to compute the required size.
// Returns wxCONV_FAILED if iconv reports an error.
2230 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2231 const char *src
, size_t srcLen
) const
2233 if ( srcLen
== wxNO_LEN
)
2235 // find the string length: notice that this must be done differently for
2236 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2238 const size_t nulLen
= GetMBNulLen();
2242 return wxCONV_FAILED
;
2245 srcLen
= strlen(src
); // arguably more optimized than our version
2250 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2251 // but they also have to start at character boundary and not
2252 // span two adjacent characters
2254 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2260 // when we're determining the length of the string ourselves we count
2261 // the terminating NUL(s) as part of it and always NUL-terminate the
2266 // we express length in the number of (wide) characters but iconv always
2267 // counts buffer sizes in bytes
2268 dstLen
*= SIZEOF_WCHAR_T
;
2271 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2272 // Unfortunately there are a couple of global wxCSConv objects such as
2273 // wxConvLocal that are used all over wx code, so we have to make sure
2274 // the handle is used by at most one thread at the time. Otherwise
2275 // only a few wx classes would be safe to use from non-main threads
2276 // as MB<->WC conversion would fail "randomly".
2277 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2278 #endif // wxUSE_THREADS
2281 const char *pszPtr
= src
;
2285 char* bufPtr
= (char*)dst
;
2287 // have destination buffer, convert there
2288 size_t dstLenOrig
= dstLen
;
2290 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2293 // convert the number of bytes converted as returned by iconv to the
2294 // number of (wide) characters converted that we need
2295 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2299 // convert to native endianness
2300 for ( unsigned i
= 0; i
< res
; i
++ )
2301 dst
[i
] = WC_BSWAP(dst
[i
]);
2304 else // no destination buffer
2306 // convert using temp buffer to calculate the size of the buffer needed
2312 char* bufPtr
= (char*)tbuf
;
// the temporary buffer holds 8 wide characters per iteration
2313 dstLen
= 8 * SIZEOF_WCHAR_T
;
2316 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
// add the number of characters actually produced in this iteration
2319 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
// E2BIG only means that the temporary buffer was filled: keep going
2321 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2324 if (ICONV_FAILED(cres
, srcLen
))
2326 //VS: it is ok if iconv fails, hence trace only
2327 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2328 return wxCONV_FAILED
;
// Convert wchar_t text to this object's multibyte charset using iconv.
//
// With srcLen == wxNO_LEN the terminating NUL is included in the conversion.
// The input may first be copied into a temporary byte-swapped buffer (see the
// comment below). When a destination buffer is given the conversion is done
// directly into it, otherwise the input is converted piecewise into a
// 16-byte temporary buffer just to compute the required size. Returns
// wxCONV_FAILED if iconv reports an error.
2334 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2335 const wchar_t *src
, size_t srcLen
) const
2338 // NB: the reason for the locking is explained in ToWChar()
2339 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2342 if ( srcLen
== wxNO_LEN
)
2343 srcLen
= wxWcslen(src
) + 1;
// iconv counts its buffers in bytes
2345 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2346 size_t outbuflen
= dstLen
;
2349 wchar_t *tmpbuf
= 0;
2353 // need to copy to temp buffer to switch endianness
2354 // (doing WC_BSWAP twice on the original buffer won't help, as it
2355 // could be in read-only memory, or be accessed in some other thread)
// NOTE(review): no check of the malloc() result is visible in this excerpt
2356 tmpbuf
= (wchar_t *)malloc(inbuflen
+ SIZEOF_WCHAR_T
);
2357 for ( size_t i
= 0; i
< srcLen
; i
++ )
2358 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
// the extra SIZEOF_WCHAR_T bytes allocated above are for this terminator
2360 tmpbuf
[srcLen
] = L
'\0';
2364 char* inbuf
= (char*)src
;
2367 // have destination buffer, convert there
2368 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
// number of bytes written is the space we had minus the space left
2370 res
= dstLen
- outbuflen
;
2372 else // no destination buffer
2374 // convert using temp buffer to calculate the size of the buffer needed
2382 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
// add the number of bytes actually produced in this iteration
2384 res
+= 16 - outbuflen
;
// E2BIG only means that the temporary buffer was filled: keep going
2386 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2394 if (ICONV_FAILED(cres
, inbuflen
))
2396 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2397 return wxCONV_FAILED
;
// Return the number of bytes used to encode NUL in this charset.
//
// The value is computed on first use by converting an empty wide string's
// terminator with iconv and measuring how many bytes were written; the result
// is cached in m_minMBCharWidth (0 means "not computed yet"). If the test
// conversion fails, (size_t)-1 is cached and returned instead.
2403 size_t wxMBConv_iconv::GetMBNulLen() const
2405 if ( m_minMBCharWidth
== 0 )
// this is a const method, cast constness away to update the cache
2407 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2410 // NB: the reason for the locking is explained in ToWChar()
2411 wxMutexLocker
lock(self
->m_iconvMutex
);
2414 const wchar_t *wnul
= L
"";
2415 char buf
[8]; // should be enough for NUL in any encoding
2416 size_t inLen
= sizeof(wchar_t),
2417 outLen
= WXSIZEOF(buf
);
2418 char *inBuff
= (char *)wnul
;
2419 char *outBuff
= buf
;
2420 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2422 self
->m_minMBCharWidth
= (size_t)-1;
// iconv advanced outBuff by the number of bytes it wrote
2426 self
->m_minMBCharWidth
= outBuff
- buf
;
2430 return m_minMBCharWidth
;
2433 #if wxUSE_UNICODE_UTF8
// Return true if this object's charset name is "UTF-8" or "UTF8", compared
// case-insensitively.
2434 bool wxMBConv_iconv::IsUTF8() const
2436 return wxStricmp(m_name
, "UTF-8") == 0 ||
2437 wxStricmp(m_name
, "UTF8") == 0;
2441 #endif // HAVE_ICONV
2444 // ============================================================================
2445 // Win32 conversion classes
2446 // ============================================================================
2448 #ifdef wxHAVE_WIN32_MB2WC
2452 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2453 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
// wxMBConv implementation for a single Windows code page, built on
// ::MultiByteToWideChar() and ::WideCharToMultiByte(). UTF-7 and UTF-8 input
// is delegated to wxMBConvUTF7/wxMBConvUTF8 in MB2WC(). Because the Win32
// functions may silently perform lossy conversions, both directions fall back
// to a round trip check when the corresponding "strict" flag can't be used.
2456 class wxMBConv_win32
: public wxMBConv
2461 m_CodePage
= CP_ACP
;
2462 m_minMBCharWidth
= 0;
2465 wxMBConv_win32(const wxMBConv_win32
& conv
)
2468 m_CodePage
= conv
.m_CodePage
;
2469 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2473 wxMBConv_win32(const char* name
)
2475 m_CodePage
= wxCharsetToCodepage(name
);
2476 m_minMBCharWidth
= 0;
2479 wxMBConv_win32(wxFontEncoding encoding
)
2481 m_CodePage
= wxEncodingToCodepage(encoding
);
2482 m_minMBCharWidth
= 0;
2484 #endif // wxUSE_FONTMAP
2486 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2488 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2489 // the behaviour is not compatible with the Unix version (using iconv)
2490 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2491 // wouldn't work if reading an incomplete MB char didn't result in an
2494 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2495 // Win XP or newer and it is not supported for UTF-[78] so we always
2496 // use our own conversions in this case. See
2497 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2498 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2499 if ( m_CodePage
== CP_UTF8
)
2501 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2504 if ( m_CodePage
== CP_UTF7
)
2506 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2510 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2511 IsAtLeastWin2kSP4() )
2513 flags
= MB_ERR_INVALID_CHARS
;
2516 const size_t len
= ::MultiByteToWideChar
2518 m_CodePage
, // code page
2519 flags
, // flags: fail on error
2520 psz
, // input string
2521 -1, // its length (NUL-terminated)
2522 buf
, // output string
2523 buf
? n
: 0 // size of output buffer
2527 // function totally failed
2528 return wxCONV_FAILED
;
2531 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2532 // check if we succeeded, by doing a double trip:
2533 if ( !flags
&& buf
)
2535 const size_t mbLen
= strlen(psz
);
2536 wxCharBuffer
mbBuf(mbLen
);
2537 if ( ::WideCharToMultiByte
2544 mbLen
+ 1, // size in bytes, not length
2548 strcmp(mbBuf
, psz
) != 0 )
2550 // we didn't obtain the same thing we started from, hence
2551 // the conversion was lossy and we consider that it failed
2552 return wxCONV_FAILED
;
2556 // note that it returns count of written chars for buf != NULL and size
2557 // of the needed buffer for buf == NULL so in either case the length of
2558 // the string (which never includes the terminating NUL) is one less
2562 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2565 we have a problem here: by default, WideCharToMultiByte() may
2566 replace characters unrepresentable in the target code page with bad
2567 quality approximations such as turning "1/2" symbol (U+00BD) into
2568 "1" for the code pages which don't have it and we, obviously, want
2569 to avoid this at any price
2571 the trouble is that this function does it _silently_, i.e. it won't
2572 even tell us whether it did or not... Win98/2000 and higher provide
2573 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2574 we have to resort to a round trip, i.e. check that converting back
2575 results in the same string -- this is, of course, expensive but
2576 otherwise we simply can't be sure to not garble the data.
2579 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2580 // it doesn't work with CJK encodings (which we test for rather roughly
2581 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2583 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2586 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2588 // it's our lucky day
2589 flags
= WC_NO_BEST_FIT_CHARS
;
2590 pUsedDef
= &usedDef
;
2592 else // old system or unsupported encoding
2598 const size_t len
= ::WideCharToMultiByte
2600 m_CodePage
, // code page
2601 flags
, // either none or no best fit
2602 pwz
, // input string
2603 -1, // it is (wide) NUL-terminated
2604 buf
, // output buffer
2605 buf
? n
: 0, // and its size
2606 NULL
, // default "replacement" char
2607 pUsedDef
// [out] was it used?
2612 // function totally failed
2613 return wxCONV_FAILED
;
2616 // we did something, check if we really succeeded
2619 // check if the conversion failed, i.e. if any replacements
2622 return wxCONV_FAILED
;
2624 else // we must resort to double tripping...
2626 // first we need to ensure that we really have the MB data: this is
2627 // not the case if we're called with NULL buffer, in which case we
2628 // need to do the conversion yet again
2629 wxCharBuffer bufDef
;
2632 bufDef
= wxCharBuffer(len
);
2633 buf
= bufDef
.data();
2634 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2635 buf
, len
, NULL
, NULL
) )
2636 return wxCONV_FAILED
;
2641 wxWCharBuffer
wcBuf(n
);
2642 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2643 wcscmp(wcBuf
, pwz
) != 0 )
2645 // we didn't obtain the same thing we started from, hence
2646 // the conversion was lossy and we consider that it failed
2647 return wxCONV_FAILED
;
2651 // see the comment above for the reason of "len - 1"
// returns the number of bytes NUL takes in this code page, computed on first
// use by converting L"" and cached in m_minMBCharWidth; (size_t)-1 is cached
// when the length can't be determined
2655 virtual size_t GetMBNulLen() const
2657 if ( m_minMBCharWidth
== 0 )
2659 int len
= ::WideCharToMultiByte
2661 m_CodePage
, // code page
2663 L
"", // input string
2664 1, // translate just the NUL
2665 NULL
, // output buffer
2667 NULL
, // no replacement char
2668 NULL
// [out] don't care if it was used
// this is a const method, cast constness away to update the cache
2671 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2675 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2676 self
->m_minMBCharWidth
= (size_t)-1;
2680 self
->m_minMBCharWidth
= (size_t)-1;
2686 self
->m_minMBCharWidth
= len
;
2691 return m_minMBCharWidth
;
2694 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
// -1 is the code page value that IsOk() treats as invalid
2696 bool IsOk() const { return m_CodePage
!= -1; }
// the OS version is queried only once, the answer is cached in a static
// variable which is -1 until then
2699 static bool CanUseNoBestFit()
2701 static int s_isWin98Or2k
= -1;
2703 if ( s_isWin98Or2k
== -1 )
2706 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2708 case wxOS_WINDOWS_9X
:
2709 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2712 case wxOS_WINDOWS_NT
:
2713 s_isWin98Or2k
= verMaj
>= 5;
2717 // unknown: be conservative by default
2722 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2725 return s_isWin98Or2k
== 1;
// same caching scheme as in CanUseNoBestFit() above, but based on the
// version and service pack numbers returned by GetVersionEx()
2728 static bool IsAtLeastWin2kSP4()
2733 static int s_isAtLeastWin2kSP4
= -1;
2735 if ( s_isAtLeastWin2kSP4
== -1 )
2737 OSVERSIONINFOEX ver
;
2739 memset(&ver
, 0, sizeof(ver
));
2740 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2741 GetVersionEx((OSVERSIONINFO
*)&ver
);
2743 s_isAtLeastWin2kSP4
=
2744 ((ver
.dwMajorVersion
> 5) || // Vista+
2745 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2746 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2747 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2751 return s_isAtLeastWin2kSP4
== 1;
2756 // the code page we're working with
2759 // cached result of GetMBNulLen(), set to 0 initially meaning
2761 size_t m_minMBCharWidth
;
2764 #endif // wxHAVE_WIN32_MB2WC
2767 // ============================================================================
2768 // wxEncodingConverter based conversion classes
2769 // ============================================================================
// wxMBConv implementation built on a pair of wxEncodingConverter objects, one
// for each direction between m_enc and wxFONTENCODING_UNICODE. The object is
// usable (IsOk()) only if both converters could be initialized and the
// encoding is not in the Mac-specific range.
2773 class wxMBConv_wxwin
: public wxMBConv
2778 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2779 // The wxMBConv_cf class does a better job.
2780 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2781 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2782 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2786 // temporarily just use wxEncodingConverter stuff,
2787 // so that it works while a better implementation is built
2788 wxMBConv_wxwin(const char* name
)
// "false" is passed as the second argument of CharsetToEncoding()
2791 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2793 m_enc
= wxFONTENCODING_SYSTEM
;
2798 wxMBConv_wxwin(wxFontEncoding enc
)
// the buffer size parameter is unused: the input length is taken from
// strlen() and the converter is given no output limit
2805 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2807 size_t inbuf
= strlen(psz
);
2810 if (!m2w
.Convert(psz
, buf
))
2811 return wxCONV_FAILED
;
// as in MB2WC(), the buffer size parameter is unused
2816 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2818 const size_t inbuf
= wxWcslen(psz
);
2821 if (!w2m
.Convert(psz
, buf
))
2822 return wxCONV_FAILED
;
// the UTF-16 and UTF-32 encodings are special-cased here (presumably to
// report 2 and 4 byte terminators -- the returned values are not visible in
// this excerpt)
2828 virtual size_t GetMBNulLen() const
2832 case wxFONTENCODING_UTF16BE
:
2833 case wxFONTENCODING_UTF16LE
:
2836 case wxFONTENCODING_UTF32BE
:
2837 case wxFONTENCODING_UTF32LE
:
2845 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2847 bool IsOk() const { return m_ok
; }
2850 wxFontEncoding m_enc
;
2851 wxEncodingConverter m2w
, w2m
;
2854 // were we initialized successfully?
2857 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2860 // make the constructors available for unit testing
// (creates a wxMBConv_wxwin for the given charset name and then checks IsOk()
// on it; the handling of a failed check is not visible in this excerpt)
2861 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2863 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2864 if ( !result
->IsOk() )
2873 #endif // wxUSE_FONTMAP
2875 // ============================================================================
2876 // wxCSConv implementation
2877 // ============================================================================
2879 void wxCSConv::Init()
// Construct a conversion object from a charset name.
//
// A non-empty name is remembered via SetName() and also mapped to a
// wxFontEncoding: wxFONTENCODING_MAX (name not recognized) is replaced by
// wxFONTENCODING_SYSTEM and wxFONTENCODING_DEFAULT by
// wxFONTENCODING_ISO8859_1. wxFONTENCODING_SYSTEM is also assigned on a
// further path whose condition is not visible in this excerpt.
2886 wxCSConv::wxCSConv(const wxString
& charset
)
2890 if ( !charset
.empty() )
2892 SetName(charset
.ToAscii());
2896 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2897 if ( m_encoding
== wxFONTENCODING_MAX
)
2899 // set to unknown/invalid value
2900 m_encoding
= wxFONTENCODING_SYSTEM
;
2902 else if ( m_encoding
== wxFONTENCODING_DEFAULT
)
2904 // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2905 m_encoding
= wxFONTENCODING_ISO8859_1
;
2908 m_encoding
= wxFONTENCODING_SYSTEM
;
// Construct a conversion object from an encoding value.
//
// wxFONTENCODING_MAX and wxFONTENCODING_DEFAULT are not valid here: they
// trigger a wxFAIL_MSG() and are replaced by wxFONTENCODING_SYSTEM before
// being stored.
2912 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2914 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2916 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2918 encoding
= wxFONTENCODING_SYSTEM
;
2923 m_encoding
= encoding
;
2926 wxCSConv::~wxCSConv()
// Copy constructor: takes over the charset name (through SetName(), which
// stores its own copy of the string) and the encoding of the other object.
2931 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2936 SetName(conv
.m_name
);
2937 m_encoding
= conv
.m_encoding
;
// Assignment: takes over the charset name (through SetName(), which stores
// its own copy of the string) and the encoding of the other object.
2940 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2944 SetName(conv
.m_name
);
2945 m_encoding
= conv
.m_encoding
;
2950 void wxCSConv::Clear()
// Remember the charset name: a copy made with wxStrdup() is stored in m_name,
// so the caller's string doesn't need to outlive this object.
2959 void wxCSConv::SetName(const char *charset
)
2963 m_name
= wxStrdup(charset
);
2970 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2971 wxEncodingNameCache
);
2973 static wxEncodingNameCache gs_nameCache
;
// Create the real converter object for this wxCSConv (the result is stored
// in m_convReal by CreateConvIfNeeded()).
//
// The candidates are tried in the order documented below: an OS-provided
// conversion (iconv, the Win32 API or CoreFoundation under Darwin), then the
// built-in UTF converters and finally the wxEncodingConverter-based
// wxMBConv_wxwin. If everything fails an error is logged.
//
// NOTE(review): the checks deciding whether each candidate is usable and the
// failure return value (presumably NULL, see IsOk()) are not visible in this
// excerpt.
2976 wxMBConv
*wxCSConv::DoCreate() const
2979 wxLogTrace(TRACE_STRCONV
,
2980 wxT("creating conversion for %s"),
2982 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2983 #endif // wxUSE_FONTMAP
2985 // check for the special case of ASCII or ISO8859-1 charset: as we have
2986 // special knowledge of it anyhow, we don't need to create a special
2987 // conversion object
2988 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2989 m_encoding
== wxFONTENCODING_DEFAULT
)
2991 // don't convert at all
2995 // we trust OS to do conversion better than we can so try external
2996 // conversion methods first
2998 // the full order is:
2999 // 1. OS conversion (iconv() under Unix or Win32 API)
3000 // 2. hard coded conversions for UTF
3001 // 3. wxEncodingConverter as fall back
3007 #endif // !wxUSE_FONTMAP
3010 wxFontEncoding
encoding(m_encoding
);
// step 1a: iconv, first using the explicit charset name
3015 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
3023 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3024 #endif // wxUSE_FONTMAP
// step 1b: iconv by encoding; consult the cache of earlier attempts first
3028 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
3029 if ( it
!= gs_nameCache
.end() )
// an empty cached name is the marker stored below for an earlier failure
3031 if ( it
->second
.empty() )
3034 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
// nothing cached for this encoding: try every name the font mapper knows
3041 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
3042 // CS : in case this does not return valid names (eg for MacRoman)
3043 // encoding got a 'failure' entry in the cache all the same,
3044 // although it just has to be created using a different method, so
3045 // only store failed iconv creation attempts (or perhaps we
3046 // shouldn't do this at all ?)
3047 if ( names
[0] != NULL
)
3049 for ( ; *names
; ++names
)
3051 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3052 // will need changes that will obsolete this
3053 wxString
name(*names
);
3054 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
// remember this name for the encoding so later calls can reuse it
3057 gs_nameCache
[encoding
] = *names
;
3064 gs_nameCache
[encoding
] = _T(""); // cache the failure
3067 #endif // wxUSE_FONTMAP
3069 #endif // HAVE_ICONV
3071 #ifdef wxHAVE_WIN32_MB2WC
// step 1 under Win32: prefer the charset name if we have one
3074 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3075 : new wxMBConv_win32(m_encoding
);
3084 #endif // wxHAVE_WIN32_MB2WC
3088 // leave UTF16 and UTF32 to the built-ins of wx
3089 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3090 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3093 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3094 : new wxMBConv_cf(m_encoding
);
3096 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3105 #endif // __DARWIN__
// step 2: built-in UTF converters, selected by encoding
3108 wxFontEncoding enc
= m_encoding
;
3110 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3112 // use "false" to suppress interactive dialogs -- we can be called from
3113 // anywhere and popping up a dialog from here is the last thing we want to
3115 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3117 #endif // wxUSE_FONTMAP
3121 case wxFONTENCODING_UTF7
:
3122 return new wxMBConvUTF7
;
3124 case wxFONTENCODING_UTF8
:
3125 return new wxMBConvUTF8
;
3127 case wxFONTENCODING_UTF16BE
:
3128 return new wxMBConvUTF16BE
;
3130 case wxFONTENCODING_UTF16LE
:
3131 return new wxMBConvUTF16LE
;
3133 case wxFONTENCODING_UTF32BE
:
3134 return new wxMBConvUTF32BE
;
3136 case wxFONTENCODING_UTF32LE
:
3137 return new wxMBConvUTF32LE
;
3140 // nothing to do but put here to suppress gcc warnings
// step 3: fall back to the wxEncodingConverter-based implementation
3147 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3148 : new wxMBConv_wxwin(m_encoding
);
3154 #endif // wxUSE_FONTMAP
3156 // NB: This is a hack to prevent deadlock. What could otherwise happen
3157 // in Unicode build: wxConvLocal creation ends up being here
3158 // because of some failure and logs the error. But wxLog will try to
3159 // attach a timestamp, for which it will need wxConvLocal (to convert
3160 // time to char* and then wchar_t*), but that fails, tries to log the
3161 // error, but wxLog has an (already locked) critical section that
3162 // guards the static buffer.
3163 static bool alreadyLoggingError
= false;
3164 if (!alreadyLoggingError
)
// set the flag for the duration of the wxLogError() call so that a nested
// call reaching this point skips the logging
3166 alreadyLoggingError
= true;
3167 wxLogError(_("Cannot convert from the charset '%s'!"),
3171 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
3172 #else // !wxUSE_FONTMAP
3173 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
3174 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3177 alreadyLoggingError
= false;
// Create the real converter: m_convReal is set to the result of DoCreate()
// and m_deferred is reset to false.
//
// The method is const because it is called from the const conversion
// methods; the members are modified through a cast-away-const "self" pointer.
//
// NOTE(review): the condition guarding the creation (presumably a test of
// m_deferred) and the wxUSE_INTL #if/#else are not visible in this excerpt.
3183 void wxCSConv::CreateConvIfNeeded() const
3187 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3189 // if we have neither the name nor the encoding, use the default
3190 // encoding for this system
3191 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
3194 self
->m_encoding
= wxLocale::GetSystemEncoding();
3196 // fallback to some reasonable default:
3197 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3198 #endif // wxUSE_INTL
3201 self
->m_convReal
= DoCreate();
3202 self
->m_deferred
= false;
// Return true if this object can perform conversions: either the encoding is
// ISO8859-1, which is handled by wxCSConv itself, or a real converter object
// exists after CreateConvIfNeeded() has run.
3206 bool wxCSConv::IsOk() const
3208 CreateConvIfNeeded();
3210 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3211 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3212 return true; // always ok as we do it ourselves
3214 // m_convReal->IsOk() is called at its own creation, so we know it must
3215 // be ok if m_convReal is non-NULL
3216 return m_convReal
!= NULL
;
// Convert the multibyte string src to wide characters in dst.
//
// The call is forwarded to m_convReal->ToWChar(); the code after that is the
// built-in fallback which widens each byte through unsigned char, i.e.
// treats the input as ISO8859-1.
//
// In the fallback, srcLen == wxNO_LEN means src is NUL-terminated (the
// terminator is counted and converted too) and wxCONV_FAILED is returned if
// dstLen is smaller than srcLen.
//
// NOTE(review): the test selecting between m_convReal and the fallback, any
// check of dst for NULL and the final return are not visible in this excerpt.
3219 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3220 const char *src
, size_t srcLen
) const
3222 CreateConvIfNeeded();
3225 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
3228 if ( srcLen
== wxNO_LEN
)
3229 srcLen
= strlen(src
) + 1; // take trailing NUL too
// one output character per input byte, so the output needs srcLen elements
3233 if ( dstLen
< srcLen
)
3234 return wxCONV_FAILED
;
3236 for ( size_t n
= 0; n
< srcLen
; n
++ )
// go through unsigned char so that bytes >= 0x80 are not sign-extended
3237 dst
[n
] = (unsigned char)(src
[n
]);
// Convert the wide character string src to multibyte characters in dst.
//
// The call is forwarded to m_convReal->FromWChar(); the code after that is
// the built-in fallback which narrows each character to a single char, i.e.
// produces ISO8859-1. Any character above 0xFF cannot be represented and
// makes the conversion fail with wxCONV_FAILED.
//
// In the fallback, srcLen == wxNO_LEN means src is NUL-terminated (the
// terminator is counted too) and wxCONV_FAILED is also returned if dstLen is
// smaller than srcLen.
//
// NOTE(review): the test selecting between m_convReal and the fallback, the
// condition paired with the "else" branch (presumably a check of dst) and
// the final return are not visible in this excerpt.
3243 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3244 const wchar_t *src
, size_t srcLen
) const
3246 CreateConvIfNeeded();
3249 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
3252 if ( srcLen
== wxNO_LEN
)
3253 srcLen
= wxWcslen(src
) + 1;
// one output byte per input character, so the output needs srcLen bytes
3257 if ( dstLen
< srcLen
)
3258 return wxCONV_FAILED
;
3260 for ( size_t n
= 0; n
< srcLen
; n
++ )
// characters outside of the 0..0xFF range don't fit into one byte
3262 if ( src
[n
] > 0xFF )
3263 return wxCONV_FAILED
;
3265 dst
[n
] = (char)src
[n
];
3269 else // still need to check the input validity
3271 for ( size_t n
= 0; n
< srcLen
; n
++ )
3273 if ( src
[n
] > 0xFF )
3274 return wxCONV_FAILED
;
// Return the length of the NUL terminator in the multibyte encoding: the
// value reported by the real converter, or the one for ISO8859-1 otherwise.
//
// NOTE(review): the test of m_convReal and the ISO8859-1 return value are
// not visible in this excerpt.
3281 size_t wxCSConv::GetMBNulLen() const
3283 CreateConvIfNeeded();
3287 return m_convReal
->GetMBNulLen();
3290 // otherwise, we are ISO-8859-1
3294 #if wxUSE_UNICODE_UTF8
// Tell whether this converter is an UTF-8 one: the answer comes from the
// real converter, otherwise the ISO8859-1 case below applies.
//
// NOTE(review): the test of m_convReal and the ISO8859-1 return value are
// not visible in this excerpt.
3295 bool wxCSConv::IsUTF8() const
3297 CreateConvIfNeeded();
3301 return m_convReal
->IsUTF8();
3304 // otherwise, we are ISO-8859-1
// Convert the multibyte string s to a wide character buffer using several
// converters: wxConvLibc first, then UTF-8, then ISO8859-1.
//
// NOTE(review): the conditions are not visible in this excerpt; presumably
// the empty buffer is returned for a NULL s and each further converter is
// tried only if the previous one failed -- confirm against the full source.
3312 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
3315 return wxWCharBuffer();
3317 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
3319 wbuf
= wxMBConvUTF8().cMB2WX(s
);
3321 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Convert the wide character string ws to a multibyte buffer: wxConvLibc is
// used first, then an UTF-8 converter created with the
// MAP_INVALID_UTF8_TO_OCTAL option.
//
// NOTE(review): the conditions are not visible in this excerpt; presumably
// the empty buffer is returned for a NULL ws and the UTF-8 converter is used
// only if wxConvLibc failed -- confirm against the full source.
3326 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
3329 return wxCharBuffer();
3331 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
3333 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3338 #endif // wxUSE_UNICODE
3340 // ----------------------------------------------------------------------------
3342 // ----------------------------------------------------------------------------
3344 // NB: The reason why we create converter objects in this convoluted way,
3345 // using a factory function instead of global variable, is that they
3346 // may be used at static initialization time (some of them are used by
3347 // wxString ctors and there may be a global wxString object). In other
3348 // words, possibly _before_ the converter global object would be
3355 #undef wxConvISO8859_1
// Define a global converter "name" whose static type is klass and whose
// actual implementation class is impl_klass, constructed with ctor_args.
//
// The expansion consists of:
//  - the pointer variable name##Ptr, initialized to NULL
//  - the accessor wxGet_##name##Ptr() returning the address of a
//    function-local static impl_klass object, which is therefore created the
//    first time the accessor is called
//  - a file-static pointer initialized by calling the accessor, which forces
//    the object to be created during static initialization
3357 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3358 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3359 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3361 static impl_klass name##Obj ctor_args; \
3362 return &name##Obj; \
3364 /* this ensures that all global converter objects are created */ \
3365 /* by the time static initialization is done, i.e. before any */ \
3366 /* thread is launched: */ \
3367 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for the common case where the interface and the implementation
// classes are the same.
3369 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3370 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// wxConvLibc is exposed as a plain wxMBConv and implemented either by
// wxMBConv_win32 or by wxMBConvLibc, both default-constructed.
//
// NOTE(review): the preprocessor test choosing between the two definitions
// is not visible in this excerpt.
3373 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3375 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3378 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3379 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3380 // provokes an error message about "not enough macro parameters"; and we
3381 // can't use "()" here as the name##Obj declaration would be parsed as a
3382 // function declaration then, so use a semicolon and live with an extra
3383 // empty statement (and hope that no compiler warns about this)
3384 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3385 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
// the wxCSConv-based globals: the system encoding and ISO8859-1 converters
3387 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3388 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// Global converter pointers. They are initialized through the
// wxGet_xxxPtr() accessors and not from the xxxPtr variables (which are
// initialized to NULL by WX_DEFINE_GLOBAL_CONV2): wxConvCurrent starts as
// the libc converter and wxConvUI as the local (system encoding) one.
3390 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3391 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3394 // The xnu kernel always communicates file paths in decomposed UTF-8.
3395 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3396 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
// wxConvFileName is the libc converter everywhere except under Darwin
// NOTE(review): the Darwin initializer itself is not visible in this excerpt
3399 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3402 #else // !__DARWIN__
3403 wxGet_wxConvLibcPtr();
3404 #endif // __DARWIN__/!__DARWIN__
3406 #else // !wxUSE_WCHAR_T
3408 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3409 // stand-ins in absence of wchar_t
3410 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3415 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T