1 /////////////////////////////////////////////////////////////////////////////
2 // Name: src/common/strconv.cpp
3 // Purpose: Unicode conversion classes
4 // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5 // Ryan Norton, Fredrik Roubert (UTF7)
9 // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10 // (c) 2000-2003 Vadim Zeitlin
11 // (c) 2004 Ryan Norton, Fredrik Roubert
12 // Licence: wxWindows licence
13 /////////////////////////////////////////////////////////////////////////////
15 // For compilers that support precompilation, includes "wx.h".
16 #include "wx/wxprec.h"
26 #include "wx/hashmap.h"
29 #include "wx/strconv.h"
41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
42 #include "wx/msw/private.h"
43 #include "wx/msw/missing.h"
44 #define wxHAVE_WIN32_MB2WC
49 #include "wx/thread.h"
52 #include "wx/encconv.h"
53 #include "wx/fontmap.h"
56 #include "wx/mac/corefoundation/private/strconv_cf.h"
57 #endif //def __DARWIN__
60 #define TRACE_STRCONV _T("strconv")
62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
64 #if SIZEOF_WCHAR_T == 2
69 // ============================================================================
71 // ============================================================================
73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
74 static bool NotAllNULs(const char *p
, size_t n
)
76 while ( n
&& *p
++ == '\0' )
82 // ----------------------------------------------------------------------------
83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
84 // ----------------------------------------------------------------------------
86 static size_t encode_utf16(wxUint32 input
, wxUint16
*output
)
91 *output
= (wxUint16
) input
;
95 else if (input
>= 0x110000)
103 *output
++ = (wxUint16
) ((input
>> 10) + 0xd7c0);
104 *output
= (wxUint16
) ((input
& 0x3ff) + 0xdc00);
111 static size_t decode_utf16(const wxUint16
* input
, wxUint32
& output
)
113 if ((*input
< 0xd800) || (*input
> 0xdfff))
118 else if ((input
[1] < 0xdc00) || (input
[1] > 0xdfff))
121 return wxCONV_FAILED
;
125 output
= ((input
[0] - 0xd7c0) << 10) + (input
[1] - 0xdc00);
131 typedef wchar_t wxDecodeSurrogate_t
;
133 typedef wxUint16 wxDecodeSurrogate_t
;
134 #endif // WC_UTF16/!WC_UTF16
136 // returns the next UTF-32 character from the wchar_t buffer and advances the
137 // pointer to the character after this one
139 // if an invalid character is found, *pSrc is set to NULL, the caller must
141 static wxUint32
wxDecodeSurrogate(const wxDecodeSurrogate_t
**pSrc
)
145 n
= decode_utf16(wx_reinterpret_cast(const wxUint16
*, *pSrc
), out
);
146 if ( n
== wxCONV_FAILED
)
154 // ----------------------------------------------------------------------------
156 // ----------------------------------------------------------------------------
159 wxMBConv::ToWChar(wchar_t *dst
, size_t dstLen
,
160 const char *src
, size_t srcLen
) const
162 // although new conversion classes are supposed to implement this function
163 // directly, the existing ones only implement the old MB2WC() and so, to
164 // avoid having to rewrite all conversion classes at once, we provide a
165 // default (but not efficient) implementation of this one in terms of the
166 // old function by copying the input to ensure that it's NUL-terminated and
167 // then using MB2WC() to convert it
169 // the number of chars [which would be] written to dst [if it were not NULL]
170 size_t dstWritten
= 0;
172 // the number of NULs terminating this string
173 size_t nulLen
= 0; // not really needed, but just to avoid warnings
175 // if we were not given the input size we just have to assume that the
176 // string is properly terminated as we have no way of knowing how long it
177 // is anyhow, but if we do have the size check whether there are enough
181 if ( srcLen
!= wxNO_LEN
)
183 // we need to know how to find the end of this string
184 nulLen
= GetMBNulLen();
185 if ( nulLen
== wxCONV_FAILED
)
186 return wxCONV_FAILED
;
188 // if there are enough NULs we can avoid the copy
189 if ( srcLen
< nulLen
|| NotAllNULs(src
+ srcLen
- nulLen
, nulLen
) )
191 // make a copy in order to properly NUL-terminate the string
192 bufTmp
= wxCharBuffer(srcLen
+ nulLen
- 1 /* 1 will be added */);
193 char * const p
= bufTmp
.data();
194 memcpy(p
, src
, srcLen
);
195 for ( char *s
= p
+ srcLen
; s
< p
+ srcLen
+ nulLen
; s
++ )
201 srcEnd
= src
+ srcLen
;
203 else // quit after the first loop iteration
210 // try to convert the current chunk
211 size_t lenChunk
= MB2WC(NULL
, src
, 0);
212 if ( lenChunk
== wxCONV_FAILED
)
213 return wxCONV_FAILED
;
215 lenChunk
++; // for the L'\0' at the end of this chunk
217 dstWritten
+= lenChunk
;
221 // nothing left in the input string, conversion succeeded
227 if ( dstWritten
> dstLen
)
228 return wxCONV_FAILED
;
230 if ( MB2WC(dst
, src
, lenChunk
) == wxCONV_FAILED
)
231 return wxCONV_FAILED
;
238 // we convert just one chunk in this case as this is the entire
243 // advance the input pointer past the end of this chunk
244 while ( NotAllNULs(src
, nulLen
) )
246 // notice that we must skip over multiple bytes here as we suppose
247 // that if NUL takes 2 or 4 bytes, then all the other characters do
248 // too and so if advanced by a single byte we might erroneously
249 // detect sequences of NUL bytes in the middle of the input
253 src
+= nulLen
; // skipping over its terminator as well
255 // note that ">=" (and not just "==") is needed here as the terminator
256 // we skipped just above could be inside or just after the buffer
257 // delimited by srcEnd
266 wxMBConv::FromWChar(char *dst
, size_t dstLen
,
267 const wchar_t *src
, size_t srcLen
) const
269 // the number of chars [which would be] written to dst [if it were not NULL]
270 size_t dstWritten
= 0;
272 // make a copy of the input string unless it is already properly
275 // if we don't know its length we have no choice but to assume that it is,
276 // indeed, properly terminated
277 wxWCharBuffer bufTmp
;
278 if ( srcLen
== wxNO_LEN
)
280 srcLen
= wxWcslen(src
) + 1;
282 else if ( srcLen
!= 0 && src
[srcLen
- 1] != L
'\0' )
284 // make a copy in order to properly NUL-terminate the string
285 bufTmp
= wxWCharBuffer(srcLen
);
286 memcpy(bufTmp
.data(), src
, srcLen
* sizeof(wchar_t));
290 const size_t lenNul
= GetMBNulLen();
291 for ( const wchar_t * const srcEnd
= src
+ srcLen
;
293 src
+= wxWcslen(src
) + 1 /* skip L'\0' too */ )
295 // try to convert the current chunk
296 size_t lenChunk
= WC2MB(NULL
, src
, 0);
298 if ( lenChunk
== wxCONV_FAILED
)
299 return wxCONV_FAILED
;
302 dstWritten
+= lenChunk
;
306 if ( dstWritten
> dstLen
)
307 return wxCONV_FAILED
;
309 if ( WC2MB(dst
, src
, lenChunk
) == wxCONV_FAILED
)
310 return wxCONV_FAILED
;
319 size_t wxMBConv::MB2WC(wchar_t *outBuff
, const char *inBuff
, size_t outLen
) const
321 size_t rc
= ToWChar(outBuff
, outLen
, inBuff
);
322 if ( rc
!= wxCONV_FAILED
)
324 // ToWChar() returns the buffer length, i.e. including the trailing
325 // NUL, while this method doesn't take it into account
332 size_t wxMBConv::WC2MB(char *outBuff
, const wchar_t *inBuff
, size_t outLen
) const
334 size_t rc
= FromWChar(outBuff
, outLen
, inBuff
);
335 if ( rc
!= wxCONV_FAILED
)
343 wxMBConv::~wxMBConv()
345 // nothing to do here (necessary for Darwin linking probably)
348 const wxWCharBuffer
wxMBConv::cMB2WC(const char *psz
) const
352 // calculate the length of the buffer needed first
353 const size_t nLen
= ToWChar(NULL
, 0, psz
);
354 if ( nLen
!= wxCONV_FAILED
)
356 // now do the actual conversion
357 wxWCharBuffer
buf(nLen
- 1 /* +1 added implicitly */);
359 // +1 for the trailing NUL
360 if ( ToWChar(buf
.data(), nLen
, psz
) != wxCONV_FAILED
)
365 return wxWCharBuffer();
368 const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *pwz
) const
372 const size_t nLen
= FromWChar(NULL
, 0, pwz
);
373 if ( nLen
!= wxCONV_FAILED
)
375 wxCharBuffer
buf(nLen
- 1);
376 if ( FromWChar(buf
.data(), nLen
, pwz
) != wxCONV_FAILED
)
381 return wxCharBuffer();
385 wxMBConv::cMB2WC(const char *inBuff
, size_t inLen
, size_t *outLen
) const
387 const size_t dstLen
= ToWChar(NULL
, 0, inBuff
, inLen
);
388 if ( dstLen
!= wxCONV_FAILED
)
390 // notice that we allocate space for dstLen+1 wide characters here
391 // because we want the buffer to always be NUL-terminated, even if the
392 // input isn't (as otherwise the caller has no way to know its length)
393 wxWCharBuffer
wbuf(dstLen
);
394 wbuf
.data()[dstLen
- 1] = L
'\0';
395 if ( ToWChar(wbuf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
400 if ( wbuf
[dstLen
- 1] == L
'\0' )
411 return wxWCharBuffer();
415 wxMBConv::cWC2MB(const wchar_t *inBuff
, size_t inLen
, size_t *outLen
) const
417 size_t dstLen
= FromWChar(NULL
, 0, inBuff
, inLen
);
418 if ( dstLen
!= wxCONV_FAILED
)
420 const size_t nulLen
= GetMBNulLen();
422 // as above, ensure that the buffer is always NUL-terminated, even if
424 wxCharBuffer
buf(dstLen
+ nulLen
- 1);
425 memset(buf
.data() + dstLen
, 0, nulLen
);
426 if ( FromWChar(buf
.data(), dstLen
, inBuff
, inLen
) != wxCONV_FAILED
)
432 if ( dstLen
>= nulLen
&&
433 !NotAllNULs(buf
.data() + dstLen
- nulLen
, nulLen
) )
435 // in this case the output is NUL-terminated and we're not
436 // supposed to count NUL
448 return wxCharBuffer();
451 // ----------------------------------------------------------------------------
453 // ----------------------------------------------------------------------------
455 size_t wxMBConvLibc::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
457 return wxMB2WC(buf
, psz
, n
);
460 size_t wxMBConvLibc::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
462 return wxWC2MB(buf
, psz
, n
);
465 // ----------------------------------------------------------------------------
466 // wxConvBrokenFileNames
467 // ----------------------------------------------------------------------------
471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString
& charset
)
473 if ( wxStricmp(charset
, _T("UTF-8")) == 0 ||
474 wxStricmp(charset
, _T("UTF8")) == 0 )
475 m_conv
= new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA
);
477 m_conv
= new wxCSConv(charset
);
482 // ----------------------------------------------------------------------------
484 // ----------------------------------------------------------------------------
486 // Implementation (C) 2004 Fredrik Roubert
488 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
491 // BASE64 decoding table
493 static const unsigned char utf7unb64
[] =
495 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
497 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
500 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
501 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
502 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
503 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
504 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
505 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
506 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
507 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
508 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
509 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
510 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
520 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
523 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
524 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
526 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
529 size_t wxMBConvUTF7::ToWChar(wchar_t *dst
, size_t dstLen
,
530 const char *src
, size_t srcLen
) const
532 DecoderState stateOrig
,
534 if ( srcLen
== wxNO_LEN
)
536 // convert the entire string, up to and including the trailing NUL
537 srcLen
= strlen(src
) + 1;
539 // when working on the entire strings we don't update nor use the shift
540 // state from the previous call
541 statePtr
= &stateOrig
;
543 else // when working with partial strings we do use the shift state
545 statePtr
= wx_const_cast(DecoderState
*, &m_stateDecoder
);
547 // also save the old state to be able to rollback to it on error
548 stateOrig
= m_stateDecoder
;
551 // but to simplify the code below we use this variable in both cases
552 DecoderState
& state
= *statePtr
;
555 // number of characters [which would have been] written to dst [if it were
559 const char * const srcEnd
= src
+ srcLen
;
561 while ( (src
< srcEnd
) && (!dst
|| (len
< dstLen
)) )
563 const unsigned char cc
= *src
++;
565 if ( state
.IsShifted() )
567 const unsigned char dc
= utf7unb64
[cc
];
570 // end of encoded part
573 // re-parse this character normally below unless it's '-' which
574 // is consumed by the decoder
578 else // valid encoded character
580 // mini base64 decoder: each character is 6 bits
585 if ( state
.bit
>= 8 )
587 // got the full byte, consume it
589 unsigned char b
= (state
.accum
>> state
.bit
) & 0x00ff;
593 // we've got the full word, output it
595 *dst
++ = (state
.msb
<< 8) | b
;
601 // just store it while we wait for LSB
609 if ( state
.IsDirect() )
611 // start of an encoded segment?
615 return wxCONV_FAILED
; // can't have '+' at the end
619 // just the encoded plus sign, don't switch to shifted mode
632 // only printable 7 bit ASCII characters (with the exception of
633 // NUL, TAB, CR and LF) can be used directly
634 if ( cc
>= 0x7f || (cc
< ' ' &&
635 !(cc
== '\0' || cc
== '\t' || cc
== '\r' || cc
== '\n')) )
636 return wxCONV_FAILED
;
647 // as we didn't read any characters we should be called with the same
648 // data (followed by some more new data) again later so don't save our
652 return wxCONV_FAILED
;
659 // BASE64 encoding table
661 static const unsigned char utf7enb64
[] =
663 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
664 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
665 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
666 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
667 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
668 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
669 'w', 'x', 'y', 'z', '0', '1', '2', '3',
670 '4', '5', '6', '7', '8', '9', '+', '/'
674 // UTF-7 encoding table
676 // 0 - Set D (directly encoded characters)
677 // 1 - Set O (optional direct characters)
678 // 2 - whitespace characters (optional)
679 // 3 - special characters
681 static const unsigned char utf7encode
[128] =
683 0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
684 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
685 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
686 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
687 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
688 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
689 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
690 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
693 static inline bool wxIsUTF7Direct(wchar_t wc
)
695 return wc
< 0x80 && utf7encode
[wc
] < 1;
698 size_t wxMBConvUTF7::FromWChar(char *dst
, size_t dstLen
,
699 const wchar_t *src
, size_t srcLen
) const
701 EncoderState stateOrig
,
703 if ( srcLen
== wxNO_LEN
)
705 // we don't apply the stored state when operating on entire strings at
707 statePtr
= &stateOrig
;
709 srcLen
= wxWcslen(src
) + 1;
711 else // do use the mode we left the output in previously
713 stateOrig
= m_stateEncoder
;
714 statePtr
= wx_const_cast(EncoderState
*, &m_stateEncoder
);
717 EncoderState
& state
= *statePtr
;
722 const wchar_t * const srcEnd
= src
+ srcLen
;
723 while ( src
< srcEnd
&& (!dst
|| len
< dstLen
) )
726 if ( wxIsUTF7Direct(cc
) )
728 if ( state
.IsShifted() )
730 // pad with zeros the last encoded block if necessary
734 *dst
++ = utf7enb64
[((state
.accum
% 16) << (6 - state
.bit
)) % 64];
749 else if ( cc
== '+' && state
.IsDirect() )
760 else if (((wxUint32
)cc
) > 0xffff)
762 // no surrogate pair generation (yet?)
763 return wxCONV_FAILED
;
768 if ( state
.IsDirect() )
777 // BASE64 encode string
780 for ( unsigned lsb
= 0; lsb
< 2; lsb
++ )
783 state
.accum
+= lsb
? cc
& 0xff : (cc
& 0xff00) >> 8;
785 for (state
.bit
+= 8; state
.bit
>= 6; )
789 *dst
++ = utf7enb64
[(state
.accum
>> state
.bit
) % 64];
794 if ( src
== srcEnd
|| wxIsUTF7Direct(cc
= *src
) )
802 // we need to restore the original encoder state if we were called just to
803 // calculate the amount of space needed as we will presumably be called
804 // again to really convert the data now
811 // ----------------------------------------------------------------------------
813 // ----------------------------------------------------------------------------
815 static const wxUint32 utf8_max
[]=
816 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
818 // boundaries of the private use area we use to (temporarily) remap
819 // characters that are invalid in a UTF-8 encoded string
820 const wxUint32 wxUnicodePUA
= 0x100000;
821 const wxUint32 wxUnicodePUAEnd
= wxUnicodePUA
+ 256;
823 // this table gives the length of the UTF-8 encoding from its first character:
824 const unsigned char tableUtf8Lengths
[256] = {
825 // single-byte sequences (ASCII):
826 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
827 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
828 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
829 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
830 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
831 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
832 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
833 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
835 // these are invalid:
836 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
837 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
838 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
839 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
842 // two-byte sequences:
843 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
844 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
846 // three-byte sequences:
847 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
849 // four-byte sequences:
850 4, 4, 4, 4, 4, // F0..F4
852 // these are invalid again (5- or 6-byte
853 // sequences and sequences for code points
854 // above U+10FFFF, as restricted by RFC 3629):
855 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
859 wxMBConvStrictUTF8::ToWChar(wchar_t *dst
, size_t dstLen
,
860 const char *src
, size_t srcLen
) const
862 wchar_t *out
= dstLen
? dst
: NULL
;
865 if ( srcLen
== wxNO_LEN
)
866 srcLen
= strlen(src
) + 1;
868 for ( const char *p
= src
; ; p
++ )
870 if ( !(srcLen
== wxNO_LEN
? *p
: srcLen
) )
872 // all done successfully, just add the trailing NUL if we are not
873 // using explicit length
874 if ( srcLen
== wxNO_LEN
)
890 if ( out
&& !dstLen
-- )
894 unsigned char c
= *p
;
898 if ( srcLen
== 0 ) // the test works for wxNO_LEN too
901 if ( srcLen
!= wxNO_LEN
)
908 unsigned len
= tableUtf8Lengths
[c
];
912 if ( srcLen
< len
) // the test works for wxNO_LEN too
915 if ( srcLen
!= wxNO_LEN
)
918 // Char. number range | UTF-8 octet sequence
919 // (hexadecimal) | (binary)
920 // ----------------------+----------------------------------------
921 // 0000 0000 - 0000 007F | 0xxxxxxx
922 // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
923 // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
924 // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
926 // Code point value is stored in bits marked with 'x',
927 // lowest-order bit of the value on the right side in the diagram
928 // above. (from RFC 3629)
930 // mask to extract lead byte's value ('x' bits above), by sequence
932 static const unsigned char leadValueMask
[] = { 0x7F, 0x1F, 0x0F, 0x07 };
934 // mask and value of lead byte's most significant bits, by length:
935 static const unsigned char leadMarkerMask
[] = { 0x80, 0xE0, 0xF0, 0xF8 };
936 static const unsigned char leadMarkerVal
[] = { 0x00, 0xC0, 0xE0, 0xF0 };
938 len
--; // it's more convenient to work with 0-based length here
940 // extract the lead byte's value bits:
941 if ( (c
& leadMarkerMask
[len
]) != leadMarkerVal
[len
] )
944 code
= c
& leadValueMask
[len
];
946 // all remaining bytes, if any, are handled in the same way
947 // regardless of sequence's length:
951 if ( (c
& 0xC0) != 0x80 )
952 return wxCONV_FAILED
;
960 // cast is ok because wchar_t == wxUint16 if WC_UTF16
961 if ( encode_utf16(code
, (wxUint16
*)out
) == 2 )
970 #endif // WC_UTF16/!WC_UTF16
978 return wxCONV_FAILED
;
982 wxMBConvStrictUTF8::FromWChar(char *dst
, size_t dstLen
,
983 const wchar_t *src
, size_t srcLen
) const
985 char *out
= dstLen
? dst
: NULL
;
988 for ( const wchar_t *wp
= src
; ; wp
++ )
990 if ( !(srcLen
== wxNO_LEN
? *wp
: srcLen
) )
992 // all done successfully, just add the trailing NUL if we are not
993 // using explicit length
994 if ( srcLen
== wxNO_LEN
)
1010 if ( srcLen
!= wxNO_LEN
)
1015 // cast is ok for WC_UTF16
1016 if ( decode_utf16((const wxUint16
*)wp
, code
) == 2 )
1018 // skip the next char too as we decoded a surrogate
1021 #else // wchar_t is UTF-32
1022 code
= *wp
& 0x7fffffff;
1034 out
[0] = (char)code
;
1037 else if ( code
<= 0x07FF )
1045 // NB: this line takes 6 least significant bits, encodes them as
1046 // 10xxxxxx and discards them so that the next byte can be encoded:
1047 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1048 out
[0] = 0xC0 | code
;
1051 else if ( code
< 0xFFFF )
1059 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1060 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1061 out
[0] = 0xE0 | code
;
1064 else if ( code
<= 0x10FFFF )
1072 out
[3] = 0x80 | (code
& 0x3F); code
>>= 6;
1073 out
[2] = 0x80 | (code
& 0x3F); code
>>= 6;
1074 out
[1] = 0x80 | (code
& 0x3F); code
>>= 6;
1075 out
[0] = 0xF0 | code
;
1080 wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1093 // we only get here if an error occurs during decoding
1094 return wxCONV_FAILED
;
1097 size_t wxMBConvUTF8::ToWChar(wchar_t *buf
, size_t n
,
1098 const char *psz
, size_t srcLen
) const
1100 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1101 return wxMBConvStrictUTF8::ToWChar(buf
, n
, psz
, srcLen
);
1105 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1107 const char *opsz
= psz
;
1108 bool invalid
= false;
1109 unsigned char cc
= *psz
++, fc
= cc
;
1111 for (cnt
= 0; fc
& 0x80; cnt
++)
1121 // escape the escape character for octal escapes
1122 if ((m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1123 && cc
== '\\' && (!buf
|| len
< n
))
1135 // invalid UTF-8 sequence
1140 unsigned ocnt
= cnt
- 1;
1141 wxUint32 res
= cc
& (0x3f >> cnt
);
1145 if ((cc
& 0xC0) != 0x80)
1147 // invalid UTF-8 sequence
1153 res
= (res
<< 6) | (cc
& 0x3f);
1156 if (invalid
|| res
<= utf8_max
[ocnt
])
1158 // illegal UTF-8 encoding
1161 else if ((m_options
& MAP_INVALID_UTF8_TO_PUA
) &&
1162 res
>= wxUnicodePUA
&& res
< wxUnicodePUAEnd
)
1164 // if one of our PUA characters turns up externally
1165 // it must also be treated as an illegal sequence
1166 // (a bit like you have to escape an escape character)
1172 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1173 size_t pa
= encode_utf16(res
, (wxUint16
*)buf
);
1174 if (pa
== wxCONV_FAILED
)
1186 *buf
++ = (wchar_t)res
;
1188 #endif // WC_UTF16/!WC_UTF16
1194 if (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1196 while (opsz
< psz
&& (!buf
|| len
< n
))
1199 // cast is ok because wchar_t == wxUint16 if WC_UTF16
1200 size_t pa
= encode_utf16((unsigned char)*opsz
+ wxUnicodePUA
, (wxUint16
*)buf
);
1201 wxASSERT(pa
!= wxCONV_FAILED
);
1208 *buf
++ = (wchar_t)(wxUnicodePUA
+ (unsigned char)*opsz
);
1214 else if (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1216 while (opsz
< psz
&& (!buf
|| len
< n
))
1218 if ( buf
&& len
+ 3 < n
)
1220 unsigned char on
= *opsz
;
1222 *buf
++ = (wchar_t)( L
'0' + on
/ 0100 );
1223 *buf
++ = (wchar_t)( L
'0' + (on
% 0100) / 010 );
1224 *buf
++ = (wchar_t)( L
'0' + on
% 010 );
1231 else // MAP_INVALID_UTF8_NOT
1233 return wxCONV_FAILED
;
1239 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1245 static inline bool isoctal(wchar_t wch
)
1247 return L
'0' <= wch
&& wch
<= L
'7';
1250 size_t wxMBConvUTF8::FromWChar(char *buf
, size_t n
,
1251 const wchar_t *psz
, size_t srcLen
) const
1253 if ( m_options
== MAP_INVALID_UTF8_NOT
)
1254 return wxMBConvStrictUTF8::FromWChar(buf
, n
, psz
, srcLen
);
1258 while ((srcLen
== wxNO_LEN
? *psz
: srcLen
--) && ((!buf
) || (len
< n
)))
1263 // cast is ok for WC_UTF16
1264 size_t pa
= decode_utf16((const wxUint16
*)psz
, cc
);
1265 psz
+= (pa
== wxCONV_FAILED
) ? 1 : pa
;
1267 cc
= (*psz
++) & 0x7fffffff;
1270 if ( (m_options
& MAP_INVALID_UTF8_TO_PUA
)
1271 && cc
>= wxUnicodePUA
&& cc
< wxUnicodePUAEnd
)
1274 *buf
++ = (char)(cc
- wxUnicodePUA
);
1277 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
)
1278 && cc
== L
'\\' && psz
[0] == L
'\\' )
1285 else if ( (m_options
& MAP_INVALID_UTF8_TO_OCTAL
) &&
1287 isoctal(psz
[0]) && isoctal(psz
[1]) && isoctal(psz
[2]) )
1291 *buf
++ = (char) ((psz
[0] - L
'0') * 0100 +
1292 (psz
[1] - L
'0') * 010 +
1302 for (cnt
= 0; cc
> utf8_max
[cnt
]; cnt
++)
1318 *buf
++ = (char) ((-128 >> cnt
) | ((cc
>> (cnt
* 6)) & (0x3f >> cnt
)));
1320 *buf
++ = (char) (0x80 | ((cc
>> (cnt
* 6)) & 0x3f));
1326 if (srcLen
== wxNO_LEN
&& buf
&& (len
< n
))
1332 // ============================================================================
1334 // ============================================================================
1336 #ifdef WORDS_BIGENDIAN
1337 #define wxMBConvUTF16straight wxMBConvUTF16BE
1338 #define wxMBConvUTF16swap wxMBConvUTF16LE
1340 #define wxMBConvUTF16swap wxMBConvUTF16BE
1341 #define wxMBConvUTF16straight wxMBConvUTF16LE
1345 size_t wxMBConvUTF16Base::GetLength(const char *src
, size_t srcLen
)
1347 if ( srcLen
== wxNO_LEN
)
1349 // count the number of bytes in input, including the trailing NULs
1350 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1351 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1354 srcLen
*= BYTES_PER_CHAR
;
1356 else // we already have the length
1358 // we can only convert an entire number of UTF-16 characters
1359 if ( srcLen
% BYTES_PER_CHAR
)
1360 return wxCONV_FAILED
;
1366 // case when in-memory representation is UTF-16 too
1369 // ----------------------------------------------------------------------------
1370 // conversions without endianness change
1371 // ----------------------------------------------------------------------------
1374 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1375 const char *src
, size_t srcLen
) const
1377 // set up the scene for using memcpy() (which is presumably more efficient
1378 // than copying the bytes one by one)
1379 srcLen
= GetLength(src
, srcLen
);
1380 if ( srcLen
== wxNO_LEN
)
1381 return wxCONV_FAILED
;
1383 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1386 if ( dstLen
< inLen
)
1387 return wxCONV_FAILED
;
1389 memcpy(dst
, src
, srcLen
);
1396 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1397 const wchar_t *src
, size_t srcLen
) const
1399 if ( srcLen
== wxNO_LEN
)
1400 srcLen
= wxWcslen(src
) + 1;
1402 srcLen
*= BYTES_PER_CHAR
;
1406 if ( dstLen
< srcLen
)
1407 return wxCONV_FAILED
;
1409 memcpy(dst
, src
, srcLen
);
1415 // ----------------------------------------------------------------------------
1416 // endian-reversing conversions
1417 // ----------------------------------------------------------------------------
1420 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1421 const char *src
, size_t srcLen
) const
1423 srcLen
= GetLength(src
, srcLen
);
1424 if ( srcLen
== wxNO_LEN
)
1425 return wxCONV_FAILED
;
1427 srcLen
/= BYTES_PER_CHAR
;
1431 if ( dstLen
< srcLen
)
1432 return wxCONV_FAILED
;
1434 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1435 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1437 *dst
++ = wxUINT16_SWAP_ALWAYS(*inBuff
);
1445 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1446 const wchar_t *src
, size_t srcLen
) const
1448 if ( srcLen
== wxNO_LEN
)
1449 srcLen
= wxWcslen(src
) + 1;
1451 srcLen
*= BYTES_PER_CHAR
;
1455 if ( dstLen
< srcLen
)
1456 return wxCONV_FAILED
;
1458 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1459 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1461 *outBuff
++ = wxUINT16_SWAP_ALWAYS(*src
);
1468 #else // !WC_UTF16: wchar_t is UTF-32
1470 // ----------------------------------------------------------------------------
1471 // conversions without endianness change
1472 // ----------------------------------------------------------------------------
1475 wxMBConvUTF16straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1476 const char *src
, size_t srcLen
) const
1478 srcLen
= GetLength(src
, srcLen
);
1479 if ( srcLen
== wxNO_LEN
)
1480 return wxCONV_FAILED
;
1482 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1485 // optimization: return maximal space which could be needed for this
1486 // string even if the real size could be smaller if the buffer contains
1492 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1493 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1495 const wxUint32 ch
= wxDecodeSurrogate(&inBuff
);
1497 return wxCONV_FAILED
;
1499 if ( ++outLen
> dstLen
)
1500 return wxCONV_FAILED
;
1510 wxMBConvUTF16straight::FromWChar(char *dst
, size_t dstLen
,
1511 const wchar_t *src
, size_t srcLen
) const
1513 if ( srcLen
== wxNO_LEN
)
1514 srcLen
= wxWcslen(src
) + 1;
1517 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1518 for ( size_t n
= 0; n
< srcLen
; n
++ )
1521 const size_t numChars
= encode_utf16(*src
++, cc
);
1522 if ( numChars
== wxCONV_FAILED
)
1523 return wxCONV_FAILED
;
1525 outLen
+= numChars
* BYTES_PER_CHAR
;
1528 if ( outLen
> dstLen
)
1529 return wxCONV_FAILED
;
1532 if ( numChars
== 2 )
1534 // second character of a surrogate
1543 // ----------------------------------------------------------------------------
1544 // endian-reversing conversions
1545 // ----------------------------------------------------------------------------
1548 wxMBConvUTF16swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1549 const char *src
, size_t srcLen
) const
1551 srcLen
= GetLength(src
, srcLen
);
1552 if ( srcLen
== wxNO_LEN
)
1553 return wxCONV_FAILED
;
1555 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1558 // optimization: return maximal space which could be needed for this
1559 // string even if the real size could be smaller if the buffer contains
1565 const wxUint16
*inBuff
= wx_reinterpret_cast(const wxUint16
*, src
);
1566 for ( const wxUint16
* const inEnd
= inBuff
+ inLen
; inBuff
< inEnd
; )
1571 tmp
[0] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1573 tmp
[1] = wxUINT16_SWAP_ALWAYS(*inBuff
);
1575 const size_t numChars
= decode_utf16(tmp
, ch
);
1576 if ( numChars
== wxCONV_FAILED
)
1577 return wxCONV_FAILED
;
1579 if ( numChars
== 2 )
1582 if ( ++outLen
> dstLen
)
1583 return wxCONV_FAILED
;
1593 wxMBConvUTF16swap::FromWChar(char *dst
, size_t dstLen
,
1594 const wchar_t *src
, size_t srcLen
) const
1596 if ( srcLen
== wxNO_LEN
)
1597 srcLen
= wxWcslen(src
) + 1;
1600 wxUint16
*outBuff
= wx_reinterpret_cast(wxUint16
*, dst
);
1601 for ( const wchar_t *srcEnd
= src
+ srcLen
; src
< srcEnd
; src
++ )
1604 const size_t numChars
= encode_utf16(*src
, cc
);
1605 if ( numChars
== wxCONV_FAILED
)
1606 return wxCONV_FAILED
;
1608 outLen
+= numChars
* BYTES_PER_CHAR
;
1611 if ( outLen
> dstLen
)
1612 return wxCONV_FAILED
;
1614 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[0]);
1615 if ( numChars
== 2 )
1617 // second character of a surrogate
1618 *outBuff
++ = wxUINT16_SWAP_ALWAYS(cc
[1]);
1626 #endif // WC_UTF16/!WC_UTF16
1629 // ============================================================================
1631 // ============================================================================
1633 #ifdef WORDS_BIGENDIAN
1634 #define wxMBConvUTF32straight wxMBConvUTF32BE
1635 #define wxMBConvUTF32swap wxMBConvUTF32LE
1637 #define wxMBConvUTF32swap wxMBConvUTF32BE
1638 #define wxMBConvUTF32straight wxMBConvUTF32LE
1642 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE
) wxConvUTF32LE
;
1643 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE
) wxConvUTF32BE
;
// Compute the length, in bytes, of the UTF-32 input that is to be converted.
//
// src:    the input buffer, read here as a sequence of wxUint32 units
// srcLen: its length in bytes, or wxNO_LEN if it must be determined here
//
// When srcLen is wxNO_LEN the buffer is scanned up to and including the
// first zero unit and the unit count is scaled by BYTES_PER_CHAR. Otherwise
// a srcLen which is not a multiple of BYTES_PER_CHAR yields wxCONV_FAILED.
//
// NOTE(review): the final return statement is elided from this excerpt --
// presumably the (possibly recomputed) srcLen is returned. The callers test
// the result against wxNO_LEN rather than wxCONV_FAILED -- presumably both
// have the same value; confirm.
1646 size_t wxMBConvUTF32Base::GetLength(const char *src
, size_t srcLen
)
1648 if ( srcLen
== wxNO_LEN
)
1650 // count the number of bytes in input, including the trailing NULs
1651 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1652 for ( srcLen
= 1; *inBuff
++; srcLen
++ )
1655 srcLen
*= BYTES_PER_CHAR
;
1657 else // we already have the length
1659 // we can only convert an entire number of UTF-32 characters
1660 if ( srcLen
% BYTES_PER_CHAR
)
1661 return wxCONV_FAILED
;
1667 // case when in-memory representation is UTF-16
1670 // ----------------------------------------------------------------------------
1671 // conversions without endianness change
1672 // ----------------------------------------------------------------------------
1675 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1676 const char *src
, size_t srcLen
) const
1678 srcLen
= GetLength(src
, srcLen
);
1679 if ( srcLen
== wxNO_LEN
)
1680 return wxCONV_FAILED
;
1682 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1683 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1685 for ( size_t n
= 0; n
< inLen
; n
++ )
1688 const size_t numChars
= encode_utf16(*inBuff
++, cc
);
1689 if ( numChars
== wxCONV_FAILED
)
1690 return wxCONV_FAILED
;
1695 if ( outLen
> dstLen
)
1696 return wxCONV_FAILED
;
1699 if ( numChars
== 2 )
1701 // second character of a surrogate
1711 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1712 const wchar_t *src
, size_t srcLen
) const
1714 if ( srcLen
== wxNO_LEN
)
1715 srcLen
= wxWcslen(src
) + 1;
1719 // optimization: return maximal space which could be needed for this
1720 // string instead of the exact amount which could be less if there are
1721 // any surrogates in the input
1723 // we consider that surrogates are rare enough to make it worthwhile to
1724 // avoid running the loop below at the cost of slightly extra memory
1726 return srcLen
* BYTES_PER_CHAR
;
1729 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1731 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1733 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1735 return wxCONV_FAILED
;
1737 outLen
+= BYTES_PER_CHAR
;
1739 if ( outLen
> dstLen
)
1740 return wxCONV_FAILED
;
1748 // ----------------------------------------------------------------------------
1749 // endian-reversing conversions
1750 // ----------------------------------------------------------------------------
1753 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1754 const char *src
, size_t srcLen
) const
1756 srcLen
= GetLength(src
, srcLen
);
1757 if ( srcLen
== wxNO_LEN
)
1758 return wxCONV_FAILED
;
1760 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1761 const size_t inLen
= srcLen
/ BYTES_PER_CHAR
;
1763 for ( size_t n
= 0; n
< inLen
; n
++, inBuff
++ )
1766 const size_t numChars
= encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff
), cc
);
1767 if ( numChars
== wxCONV_FAILED
)
1768 return wxCONV_FAILED
;
1773 if ( outLen
> dstLen
)
1774 return wxCONV_FAILED
;
1777 if ( numChars
== 2 )
1779 // second character of a surrogate
1789 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1790 const wchar_t *src
, size_t srcLen
) const
1792 if ( srcLen
== wxNO_LEN
)
1793 srcLen
= wxWcslen(src
) + 1;
1797 // optimization: return maximal space which could be needed for this
1798 // string instead of the exact amount which could be less if there are
1799 // any surrogates in the input
1801 // we consider that surrogates are rare enough to make it worthwhile to
1802 // avoid running the loop below at the cost of slightly extra memory
1804 return srcLen
*BYTES_PER_CHAR
;
1807 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1809 for ( const wchar_t * const srcEnd
= src
+ srcLen
; src
< srcEnd
; )
1811 const wxUint32 ch
= wxDecodeSurrogate(&src
);
1813 return wxCONV_FAILED
;
1815 outLen
+= BYTES_PER_CHAR
;
1817 if ( outLen
> dstLen
)
1818 return wxCONV_FAILED
;
1820 *outBuff
++ = wxUINT32_SWAP_ALWAYS(ch
);
1826 #else // !WC_UTF16: wchar_t is UTF-32
1828 // ----------------------------------------------------------------------------
1829 // conversions without endianness change
1830 // ----------------------------------------------------------------------------
1833 wxMBConvUTF32straight::ToWChar(wchar_t *dst
, size_t dstLen
,
1834 const char *src
, size_t srcLen
) const
1836 // use memcpy() as it should be much faster than hand-written loop
1837 srcLen
= GetLength(src
, srcLen
);
1838 if ( srcLen
== wxNO_LEN
)
1839 return wxCONV_FAILED
;
1841 const size_t inLen
= srcLen
/BYTES_PER_CHAR
;
1844 if ( dstLen
< inLen
)
1845 return wxCONV_FAILED
;
1847 memcpy(dst
, src
, srcLen
);
1854 wxMBConvUTF32straight::FromWChar(char *dst
, size_t dstLen
,
1855 const wchar_t *src
, size_t srcLen
) const
1857 if ( srcLen
== wxNO_LEN
)
1858 srcLen
= wxWcslen(src
) + 1;
1860 srcLen
*= BYTES_PER_CHAR
;
1864 if ( dstLen
< srcLen
)
1865 return wxCONV_FAILED
;
1867 memcpy(dst
, src
, srcLen
);
1873 // ----------------------------------------------------------------------------
1874 // endian-reversing conversions
1875 // ----------------------------------------------------------------------------
1878 wxMBConvUTF32swap::ToWChar(wchar_t *dst
, size_t dstLen
,
1879 const char *src
, size_t srcLen
) const
1881 srcLen
= GetLength(src
, srcLen
);
1882 if ( srcLen
== wxNO_LEN
)
1883 return wxCONV_FAILED
;
1885 srcLen
/= BYTES_PER_CHAR
;
1889 if ( dstLen
< srcLen
)
1890 return wxCONV_FAILED
;
1892 const wxUint32
*inBuff
= wx_reinterpret_cast(const wxUint32
*, src
);
1893 for ( size_t n
= 0; n
< srcLen
; n
++, inBuff
++ )
1895 *dst
++ = wxUINT32_SWAP_ALWAYS(*inBuff
);
1903 wxMBConvUTF32swap::FromWChar(char *dst
, size_t dstLen
,
1904 const wchar_t *src
, size_t srcLen
) const
1906 if ( srcLen
== wxNO_LEN
)
1907 srcLen
= wxWcslen(src
) + 1;
1909 srcLen
*= BYTES_PER_CHAR
;
1913 if ( dstLen
< srcLen
)
1914 return wxCONV_FAILED
;
1916 wxUint32
*outBuff
= wx_reinterpret_cast(wxUint32
*, dst
);
1917 for ( size_t n
= 0; n
< srcLen
; n
+= BYTES_PER_CHAR
, src
++ )
1919 *outBuff
++ = wxUINT32_SWAP_ALWAYS(*src
);
1926 #endif // WC_UTF16/!WC_UTF16
1929 // ============================================================================
1930 // The classes doing conversion using the iconv_xxx() functions
1931 // ============================================================================
1935 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1936 // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1937 // (unless there's yet another bug in glibc) the only case when iconv()
1938 // returns with (size_t)-1 (which means error) and says there are 0 bytes
1939 // left in the input buffer -- when _real_ error occurs,
1940 // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1942 // [This bug does not appear in glibc 2.2.]
1943 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1944 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1945 (errno != E2BIG || bufLeft != 0))
1947 #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1950 #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1952 #define ICONV_T_INVALID ((iconv_t)-1)
1954 #if SIZEOF_WCHAR_T == 4
1955 #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1956 #define WC_ENC wxFONTENCODING_UTF32
1957 #elif SIZEOF_WCHAR_T == 2
1958 #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1959 #define WC_ENC wxFONTENCODING_UTF16
1960 #else // sizeof(wchar_t) != 2 nor 4
1961 // does this ever happen?
1962 #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1965 // ----------------------------------------------------------------------------
1966 // wxMBConv_iconv: encapsulates an iconv character set
1967 // ----------------------------------------------------------------------------
1969 class wxMBConv_iconv
: public wxMBConv
1972 wxMBConv_iconv(const char *name
);
1973 virtual ~wxMBConv_iconv();
1975 // implement base class virtual methods
1976 virtual size_t ToWChar(wchar_t *dst
, size_t dstLen
,
1977 const char *src
, size_t srcLen
= wxNO_LEN
) const;
1978 virtual size_t FromWChar(char *dst
, size_t dstLen
,
1979 const wchar_t *src
, size_t srcLen
= wxNO_LEN
) const;
1980 virtual size_t GetMBNulLen() const;
1982 #if wxUSE_UNICODE_UTF8
1983 virtual bool IsUTF8() const;
1986 virtual wxMBConv
*Clone() const
1988 wxMBConv_iconv
*p
= new wxMBConv_iconv(m_name
.ToAscii());
1989 p
->m_minMBCharWidth
= m_minMBCharWidth
;
1994 { return (m2w
!= ICONV_T_INVALID
) && (w2m
!= ICONV_T_INVALID
); }
1997 // the iconv handlers used to translate from multibyte
1998 // to wide char and in the other direction
2003 // guards access to m2w and w2m objects
2004 wxMutex m_iconvMutex
;
2008 // the name (for iconv_open()) of a wide char charset -- if none is
2009 // available on this machine, it will remain empty
2010 static wxString ms_wcCharsetName
;
2012 // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2013 // different endian-ness than the native one
2014 static bool ms_wcNeedsSwap
;
2017 // name of the encoding handled by this conversion
2020 // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2022 size_t m_minMBCharWidth
;
2025 // make the constructor available for unit testing
2026 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_iconv( const char* name
)
2028 wxMBConv_iconv
* result
= new wxMBConv_iconv( name
);
2029 if ( !result
->IsOk() )
2038 wxString
wxMBConv_iconv::ms_wcCharsetName
;
2039 bool wxMBConv_iconv::ms_wcNeedsSwap
= false;
2041 wxMBConv_iconv::wxMBConv_iconv(const char *name
)
2044 m_minMBCharWidth
= 0;
2046 // check for charset that represents wchar_t:
2047 if ( ms_wcCharsetName
.empty() )
2049 wxLogTrace(TRACE_STRCONV
, _T("Looking for wide char codeset:"));
2052 const wxChar
**names
= wxFontMapperBase::GetAllEncodingNames(WC_ENC
);
2053 #else // !wxUSE_FONTMAP
2054 static const wxChar
*names_static
[] =
// the entries of names_static are selected by the size of wchar_t
2056 #if SIZEOF_WCHAR_T == 4
2058 #elif SIZEOF_WCHAR_T == 2
2063 const wxChar
**names
= names_static
;
2064 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2066 for ( ; *names
&& ms_wcCharsetName
.empty(); ++names
)
2068 const wxString
nameCS(*names
);
2070 // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2071 wxString
nameXE(nameCS
);
2073 #ifdef WORDS_BIGENDIAN
2075 #else // little endian
2079 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2082 m2w
= iconv_open(nameXE
.ToAscii(), name
);
2083 if ( m2w
== ICONV_T_INVALID
)
2085 // try charset w/o bytesex info (e.g. "UCS4")
2086 wxLogTrace(TRACE_STRCONV
, _T(" trying charset \"%s\""),
2088 m2w
= iconv_open(nameCS
.ToAscii(), name
);
2090 // and check for bytesex ourselves:
2091 if ( m2w
!= ICONV_T_INVALID
)
2093 char buf
[2], *bufPtr
;
2102 outsz
= SIZEOF_WCHAR_T
* 2;
2103 char* wbufPtr
= (char*)wbuf
;
2107 m2w
, ICONV_CHAR_CAST(&bufPtr
), &insz
,
2110 if (ICONV_FAILED(res
, insz
))
2112 wxLogLastError(wxT("iconv"));
2113 wxLogError(_("Conversion to charset '%s' doesn't work."),
2116 else // ok, can convert to this encoding, remember it
2118 ms_wcCharsetName
= nameCS
;
2119 ms_wcNeedsSwap
= wbuf
[0] != (wchar_t)buf
[0];
2123 else // use charset not requiring byte swapping
2125 ms_wcCharsetName
= nameXE
;
2129 wxLogTrace(TRACE_STRCONV
,
2130 wxT("iconv wchar_t charset is \"%s\"%s"),
2131 ms_wcCharsetName
.empty() ? wxString("<none>")
2133 ms_wcNeedsSwap
? _T(" (needs swap)")
2136 else // we already have ms_wcCharsetName
2138 m2w
= iconv_open(ms_wcCharsetName
.ToAscii(), name
);
2141 if ( ms_wcCharsetName
.empty() )
2143 w2m
= ICONV_T_INVALID
;
2147 w2m
= iconv_open(name
, ms_wcCharsetName
.ToAscii());
2148 if ( w2m
== ICONV_T_INVALID
)
2150 wxLogTrace(TRACE_STRCONV
,
2151 wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2152 ms_wcCharsetName
.c_str(), name
);
2157 wxMBConv_iconv::~wxMBConv_iconv()
2159 if ( m2w
!= ICONV_T_INVALID
)
2161 if ( w2m
!= ICONV_T_INVALID
)
2166 wxMBConv_iconv::ToWChar(wchar_t *dst
, size_t dstLen
,
2167 const char *src
, size_t srcLen
) const
2169 if ( srcLen
== wxNO_LEN
)
2171 // find the string length: notice that this must be done differently for
2172 // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2174 const size_t nulLen
= GetMBNulLen();
2178 return wxCONV_FAILED
;
2181 srcLen
= strlen(src
); // arguably more optimized than our version
2186 // for UTF-16/32 not only do we need to have 2/4 consecutive NULs
2187 // but they also have to start at character boundary and not
2188 // span two adjacent characters
2190 for ( p
= src
; NotAllNULs(p
, nulLen
); p
+= nulLen
)
2196 // when we're determining the length of the string ourselves we count
2197 // the terminating NUL(s) as part of it and always NUL-terminate the
2202 // we express length in the number of (wide) characters but iconv always
2203 // counts buffer sizes in bytes
2204 dstLen
*= SIZEOF_WCHAR_T
;
2207 // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2208 // Unfortunately there are a couple of global wxCSConv objects such as
2209 // wxConvLocal that are used all over wx code, so we have to make sure
2210 // the handle is used by at most one thread at a time. Otherwise
2211 // only a few wx classes would be safe to use from non-main threads
2212 // as MB<->WC conversion would fail "randomly".
2213 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2214 #endif // wxUSE_THREADS
2217 const char *pszPtr
= src
;
2221 char* bufPtr
= (char*)dst
;
2223 // have destination buffer, convert there
2224 size_t dstLenOrig
= dstLen
;
2226 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2229 // convert the number of bytes converted as returned by iconv to the
2230 // number of (wide) characters converted that we need
2231 res
= (dstLenOrig
- dstLen
) / SIZEOF_WCHAR_T
;
2235 // convert to native endianness
2236 for ( unsigned i
= 0; i
< res
; i
++ )
2237 dst
[i
] = WC_BSWAP(dst
[i
]);
2240 else // no destination buffer
2242 // convert using temp buffer to calculate the size of the buffer needed
2248 char* bufPtr
= (char*)tbuf
;
2249 dstLen
= 8 * SIZEOF_WCHAR_T
;
2252 ICONV_CHAR_CAST(&pszPtr
), &srcLen
,
2255 res
+= 8 - (dstLen
/ SIZEOF_WCHAR_T
);
2257 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2260 if (ICONV_FAILED(cres
, srcLen
))
2262 //VS: it is ok if iconv fails, hence trace only
2263 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2264 return wxCONV_FAILED
;
2270 size_t wxMBConv_iconv::FromWChar(char *dst
, size_t dstLen
,
2271 const wchar_t *src
, size_t srcLen
) const
2274 // NB: the reason for locking is explained in ToWChar()
2275 wxMutexLocker
lock(wxConstCast(this, wxMBConv_iconv
)->m_iconvMutex
);
2278 if ( srcLen
== wxNO_LEN
)
2279 srcLen
= wxWcslen(src
) + 1;
2281 size_t inbuflen
= srcLen
* SIZEOF_WCHAR_T
;
2282 size_t outbuflen
= dstLen
;
2285 wchar_t *tmpbuf
= 0;
2289 // need to copy to temp buffer to switch endianness
2290 // (doing WC_BSWAP twice on the original buffer won't help, as it
2291 // could be in read-only memory, or be accessed in some other thread)
2292 tmpbuf
= (wchar_t *)malloc(inbuflen
+ SIZEOF_WCHAR_T
);
2293 for ( size_t i
= 0; i
< srcLen
; i
++ )
2294 tmpbuf
[i
] = WC_BSWAP(src
[i
]);
2296 tmpbuf
[srcLen
] = L
'\0';
2300 char* inbuf
= (char*)src
;
2303 // have destination buffer, convert there
2304 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2306 res
= dstLen
- outbuflen
;
2308 else // no destination buffer
2310 // convert using temp buffer to calculate the size of the buffer needed
2318 cres
= iconv(w2m
, ICONV_CHAR_CAST(&inbuf
), &inbuflen
, &dst
, &outbuflen
);
2320 res
+= 16 - outbuflen
;
2322 while ((cres
== (size_t)-1) && (errno
== E2BIG
));
2330 if (ICONV_FAILED(cres
, inbuflen
))
2332 wxLogTrace(TRACE_STRCONV
, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2333 return wxCONV_FAILED
;
// Return the length in bytes of the terminating NUL in the multibyte
// encoding handled by this converter, computing it on first use.
//
// While the cached m_minMBCharWidth is still 0, the wide empty string (i.e.
// a single wide NUL of sizeof(wchar_t) bytes) is passed through iconv() using
// the w2m handle into an 8 byte buffer. If iconv() returns (size_t)-1 the
// cache is set to (size_t)-1, otherwise to the number of bytes written
// (outBuff - buf). The cached value is returned in all cases.
//
// NOTE(review): the braces and the "else" separating the two assignments to
// m_minMBCharWidth are elided from this excerpt -- presumably they are the
// failure and success arms of the iconv() test; confirm.
2339 size_t wxMBConv_iconv::GetMBNulLen() const
2341 if ( m_minMBCharWidth
== 0 )
2343 wxMBConv_iconv
* const self
= wxConstCast(this, wxMBConv_iconv
);
2346 // NB: the reason for locking is explained in ToWChar()
2347 wxMutexLocker
lock(self
->m_iconvMutex
);
2350 const wchar_t *wnul
= L
"";
2351 char buf
[8]; // should be enough for NUL in any encoding
2352 size_t inLen
= sizeof(wchar_t),
2353 outLen
= WXSIZEOF(buf
);
2354 char *inBuff
= (char *)wnul
;
2355 char *outBuff
= buf
;
2356 if ( iconv(w2m
, ICONV_CHAR_CAST(&inBuff
), &inLen
, &outBuff
, &outLen
) == (size_t)-1 )
2358 self
->m_minMBCharWidth
= (size_t)-1;
2362 self
->m_minMBCharWidth
= outBuff
- buf
;
2366 return m_minMBCharWidth
;
2369 #if wxUSE_UNICODE_UTF8
// Report whether this converter handles UTF-8: the result is true when
// wxStricmp() finds m_name equal to either "UTF-8" or "UTF8".
2370 bool wxMBConv_iconv::IsUTF8() const
2372 return wxStricmp(m_name
, "UTF-8") == 0 ||
2373 wxStricmp(m_name
, "UTF8") == 0;
2377 #endif // HAVE_ICONV
2380 // ============================================================================
2381 // Win32 conversion classes
2382 // ============================================================================
2384 #ifdef wxHAVE_WIN32_MB2WC
2388 extern WXDLLIMPEXP_BASE
long wxCharsetToCodepage(const char *charset
);
2389 extern WXDLLIMPEXP_BASE
long wxEncodingToCodepage(wxFontEncoding encoding
);
2392 class wxMBConv_win32
: public wxMBConv
2397 m_CodePage
= CP_ACP
;
2398 m_minMBCharWidth
= 0;
2401 wxMBConv_win32(const wxMBConv_win32
& conv
)
2404 m_CodePage
= conv
.m_CodePage
;
2405 m_minMBCharWidth
= conv
.m_minMBCharWidth
;
2409 wxMBConv_win32(const char* name
)
2411 m_CodePage
= wxCharsetToCodepage(name
);
2412 m_minMBCharWidth
= 0;
2415 wxMBConv_win32(wxFontEncoding encoding
)
2417 m_CodePage
= wxEncodingToCodepage(encoding
);
2418 m_minMBCharWidth
= 0;
2420 #endif // wxUSE_FONTMAP
2422 virtual size_t MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
2424 // note that we have to use MB_ERR_INVALID_CHARS flag as without it
2425 // the behaviour is not compatible with the Unix version (using iconv)
2426 // and breaks the library itself, e.g. wxTextInputStream::NextChar()
2427 // wouldn't work if reading an incomplete MB char didn't result in an
2430 // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2431 // Win XP or newer and it is not supported for UTF-[78] so we always
2432 // use our own conversions in this case. See
2433 // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2434 // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2435 if ( m_CodePage
== CP_UTF8
)
2437 return wxMBConvUTF8().MB2WC(buf
, psz
, n
);
2440 if ( m_CodePage
== CP_UTF7
)
2442 return wxMBConvUTF7().MB2WC(buf
, psz
, n
);
2446 if ( (m_CodePage
< 50000 && m_CodePage
!= CP_SYMBOL
) &&
2447 IsAtLeastWin2kSP4() )
2449 flags
= MB_ERR_INVALID_CHARS
;
2452 const size_t len
= ::MultiByteToWideChar
2454 m_CodePage
, // code page
2455 flags
, // flags: fall on error
2456 psz
, // input string
2457 -1, // its length (NUL-terminated)
2458 buf
, // output string
2459 buf
? n
: 0 // size of output buffer
2463 // function totally failed
2464 return wxCONV_FAILED
;
2467 // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2468 // check if we succeeded, by doing a double trip:
2469 if ( !flags
&& buf
)
2471 const size_t mbLen
= strlen(psz
);
2472 wxCharBuffer
mbBuf(mbLen
);
2473 if ( ::WideCharToMultiByte
2480 mbLen
+ 1, // size in bytes, not length
2484 strcmp(mbBuf
, psz
) != 0 )
2486 // we didn't obtain the same thing we started from, hence
2487 // the conversion was lossy and we consider that it failed
2488 return wxCONV_FAILED
;
2492 // note that it returns count of written chars for buf != NULL and size
2493 // of the needed buffer for buf == NULL so in either case the length of
2494 // the string (which never includes the terminating NUL) is one less
2498 virtual size_t WC2MB(char *buf
, const wchar_t *pwz
, size_t n
) const
2501 we have a problem here: by default, WideCharToMultiByte() may
2502 replace characters unrepresentable in the target code page with bad
2503 quality approximations such as turning "1/2" symbol (U+00BD) into
2504 "1" for the code pages which don't have it and we, obviously, want
2505 to avoid this at any price
2507 the trouble is that this function does it _silently_, i.e. it won't
2508 even tell us whether it did or not... Win98/2000 and higher provide
2509 WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2510 we have to resort to a round trip, i.e. check that converting back
2511 results in the same string -- this is, of course, expensive but
2512 otherwise we simply can't be sure to not garble the data.
2515 // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2516 // it doesn't work with CJK encodings (which we test for rather roughly
2517 // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2519 BOOL usedDef
wxDUMMY_INITIALIZE(false);
2522 if ( CanUseNoBestFit() && m_CodePage
< 50000 )
2524 // it's our lucky day
2525 flags
= WC_NO_BEST_FIT_CHARS
;
2526 pUsedDef
= &usedDef
;
2528 else // old system or unsupported encoding
2534 const size_t len
= ::WideCharToMultiByte
2536 m_CodePage
, // code page
2537 flags
, // either none or no best fit
2538 pwz
, // input string
2539 -1, // it is (wide) NUL-terminated
2540 buf
, // output buffer
2541 buf
? n
: 0, // and its size
2542 NULL
, // default "replacement" char
2543 pUsedDef
// [out] was it used?
2548 // function totally failed
2549 return wxCONV_FAILED
;
2552 // we did something, check if we really succeeded
2555 // check if the conversion failed, i.e. if any replacements
2558 return wxCONV_FAILED
;
2560 else // we must resort to double tripping...
2562 // first we need to ensure that we really have the MB data: this is
2563 // not the case if we're called with NULL buffer, in which case we
2564 // need to do the conversion yet again
2565 wxCharBuffer bufDef
;
2568 bufDef
= wxCharBuffer(len
);
2569 buf
= bufDef
.data();
2570 if ( !::WideCharToMultiByte(m_CodePage
, flags
, pwz
, -1,
2571 buf
, len
, NULL
, NULL
) )
2572 return wxCONV_FAILED
;
2577 wxWCharBuffer
wcBuf(n
);
2578 if ( MB2WC(wcBuf
.data(), buf
, n
+ 1) == wxCONV_FAILED
||
2579 wcscmp(wcBuf
, pwz
) != 0 )
2581 // we didn't obtain the same thing we started from, hence
2582 // the conversion was lossy and we consider that it failed
2583 return wxCONV_FAILED
;
2587 // see the comment above for the reason of "len - 1"
2591 virtual size_t GetMBNulLen() const
2593 if ( m_minMBCharWidth
== 0 )
2595 int len
= ::WideCharToMultiByte
2597 m_CodePage
, // code page
2599 L
"", // input string
2600 1, // translate just the NUL
2601 NULL
, // output buffer
2603 NULL
, // no replacement char
2604 NULL
// [out] don't care if it was used
2607 wxMBConv_win32
* const self
= wxConstCast(this, wxMBConv_win32
);
2611 wxLogDebug(_T("Unexpected NUL length %d"), len
);
2612 self
->m_minMBCharWidth
= (size_t)-1;
2616 self
->m_minMBCharWidth
= (size_t)-1;
2622 self
->m_minMBCharWidth
= len
;
2627 return m_minMBCharWidth
;
2630 virtual wxMBConv
*Clone() const { return new wxMBConv_win32(*this); }
2632 bool IsOk() const { return m_CodePage
!= -1; }
2635 static bool CanUseNoBestFit()
2637 static int s_isWin98Or2k
= -1;
2639 if ( s_isWin98Or2k
== -1 )
2642 switch ( wxGetOsVersion(&verMaj
, &verMin
) )
2644 case wxOS_WINDOWS_9X
:
2645 s_isWin98Or2k
= verMaj
>= 4 && verMin
>= 10;
2648 case wxOS_WINDOWS_NT
:
2649 s_isWin98Or2k
= verMaj
>= 5;
2653 // unknown: be conservative by default
2658 wxASSERT_MSG( s_isWin98Or2k
!= -1, _T("should be set above") );
2661 return s_isWin98Or2k
== 1;
2664 static bool IsAtLeastWin2kSP4()
2669 static int s_isAtLeastWin2kSP4
= -1;
2671 if ( s_isAtLeastWin2kSP4
== -1 )
2673 OSVERSIONINFOEX ver
;
2675 memset(&ver
, 0, sizeof(ver
));
2676 ver
.dwOSVersionInfoSize
= sizeof(ver
);
2677 GetVersionEx((OSVERSIONINFO
*)&ver
);
2679 s_isAtLeastWin2kSP4
=
2680 ((ver
.dwMajorVersion
> 5) || // Vista+
2681 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
> 0) || // XP/2003
2682 (ver
.dwMajorVersion
== 5 && ver
.dwMinorVersion
== 0 &&
2683 ver
.wServicePackMajor
>= 4)) // 2000 SP4+
2687 return s_isAtLeastWin2kSP4
== 1;
2692 // the code page we're working with
2695 // cached result of GetMBNulLen(), set to 0 initially meaning
2697 size_t m_minMBCharWidth
;
2700 #endif // wxHAVE_WIN32_MB2WC
2703 // ============================================================================
2704 // wxEncodingConverter based conversion classes
2705 // ============================================================================
2709 class wxMBConv_wxwin
: public wxMBConv
2714 // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2715 // The wxMBConv_cf class does a better job.
2716 m_ok
= (m_enc
< wxFONTENCODING_MACMIN
|| m_enc
> wxFONTENCODING_MACMAX
) &&
2717 m2w
.Init(m_enc
, wxFONTENCODING_UNICODE
) &&
2718 w2m
.Init(wxFONTENCODING_UNICODE
, m_enc
);
2722 // temporarily just use wxEncodingConverter stuff,
2723 // so that it works while a better implementation is built
2724 wxMBConv_wxwin(const char* name
)
2727 m_enc
= wxFontMapperBase::Get()->CharsetToEncoding(name
, false);
2729 m_enc
= wxFONTENCODING_SYSTEM
;
2734 wxMBConv_wxwin(wxFontEncoding enc
)
2741 size_t MB2WC(wchar_t *buf
, const char *psz
, size_t WXUNUSED(n
)) const
2743 size_t inbuf
= strlen(psz
);
2746 if (!m2w
.Convert(psz
, buf
))
2747 return wxCONV_FAILED
;
2752 size_t WC2MB(char *buf
, const wchar_t *psz
, size_t WXUNUSED(n
)) const
2754 const size_t inbuf
= wxWcslen(psz
);
2757 if (!w2m
.Convert(psz
, buf
))
2758 return wxCONV_FAILED
;
2764 virtual size_t GetMBNulLen() const
2768 case wxFONTENCODING_UTF16BE
:
2769 case wxFONTENCODING_UTF16LE
:
2772 case wxFONTENCODING_UTF32BE
:
2773 case wxFONTENCODING_UTF32LE
:
2781 virtual wxMBConv
*Clone() const { return new wxMBConv_wxwin(m_enc
); }
2783 bool IsOk() const { return m_ok
; }
2786 wxFontEncoding m_enc
;
2787 wxEncodingConverter m2w
, w2m
;
2790 // were we initialized successfully?
2793 DECLARE_NO_COPY_CLASS(wxMBConv_wxwin
)
2796 // make the constructors available for unit testing
2797 WXDLLIMPEXP_BASE wxMBConv
* new_wxMBConv_wxwin( const char* name
)
2799 wxMBConv_wxwin
* result
= new wxMBConv_wxwin( name
);
2800 if ( !result
->IsOk() )
2809 #endif // wxUSE_FONTMAP
2811 // ============================================================================
2812 // wxCSConv implementation
2813 // ============================================================================
2815 void wxCSConv::Init()
2822 wxCSConv::wxCSConv(const wxString
& charset
)
2826 if ( !charset
.empty() )
2828 SetName(charset
.ToAscii());
2832 m_encoding
= wxFontMapperBase::GetEncodingFromName(charset
);
2834 m_encoding
= wxFONTENCODING_SYSTEM
;
2838 wxCSConv::wxCSConv(wxFontEncoding encoding
)
2840 if ( encoding
== wxFONTENCODING_MAX
|| encoding
== wxFONTENCODING_DEFAULT
)
2842 wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2844 encoding
= wxFONTENCODING_SYSTEM
;
2849 m_encoding
= encoding
;
2852 wxCSConv::~wxCSConv()
2857 wxCSConv::wxCSConv(const wxCSConv
& conv
)
2862 SetName(conv
.m_name
);
2863 m_encoding
= conv
.m_encoding
;
2866 wxCSConv
& wxCSConv::operator=(const wxCSConv
& conv
)
2870 SetName(conv
.m_name
);
2871 m_encoding
= conv
.m_encoding
;
2876 void wxCSConv::Clear()
2885 void wxCSConv::SetName(const char *charset
)
2889 m_name
= wxStrdup(charset
);
2896 WX_DECLARE_HASH_MAP( wxFontEncoding
, wxString
, wxIntegerHash
, wxIntegerEqual
,
2897 wxEncodingNameCache
);
2899 static wxEncodingNameCache gs_nameCache
;
2902 wxMBConv
*wxCSConv::DoCreate() const
2905 wxLogTrace(TRACE_STRCONV
,
2906 wxT("creating conversion for %s"),
2908 : (const char*)wxFontMapperBase::GetEncodingName(m_encoding
).mb_str()));
2909 #endif // wxUSE_FONTMAP
2911 // check for the special case of ASCII or ISO8859-1 charset: as we have
2912 // special knowledge of it anyhow, we don't need to create a special
2913 // conversion object
2914 if ( m_encoding
== wxFONTENCODING_ISO8859_1
||
2915 m_encoding
== wxFONTENCODING_DEFAULT
)
2917 // don't convert at all
2921 // we trust OS to do conversion better than we can so try external
2922 // conversion methods first
2924 // the full order is:
2925 // 1. OS conversion (iconv() under Unix or Win32 API)
2926 // 2. hard coded conversions for UTF
2927 // 3. wxEncodingConverter as fall back
2933 #endif // !wxUSE_FONTMAP
2936 wxFontEncoding
encoding(m_encoding
);
2941 wxMBConv_iconv
*conv
= new wxMBConv_iconv(m_name
);
2949 wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
2950 #endif // wxUSE_FONTMAP
2954 const wxEncodingNameCache::iterator it
= gs_nameCache
.find(encoding
);
2955 if ( it
!= gs_nameCache
.end() )
2957 if ( it
->second
.empty() )
2960 wxMBConv_iconv
*conv
= new wxMBConv_iconv(it
->second
.ToAscii());
2967 const wxChar
** names
= wxFontMapperBase::GetAllEncodingNames(encoding
);
2968 // CS : in case this does not return valid names (eg for MacRoman)
2969 // encoding got a 'failure' entry in the cache all the same,
2970 // although it just has to be created using a different method, so
2971 // only store failed iconv creation attempts (or perhaps we
2972 // shouldn't do this at all ?)
2973 if ( names
[0] != NULL
)
2975 for ( ; *names
; ++names
)
2977 // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2978 // will need changes that will obsolete this
2979 wxString
name(*names
);
2980 wxMBConv_iconv
*conv
= new wxMBConv_iconv(name
.ToAscii());
2983 gs_nameCache
[encoding
] = *names
;
2990 gs_nameCache
[encoding
] = _T(""); // cache the failure
2993 #endif // wxUSE_FONTMAP
2995 #endif // HAVE_ICONV
2997 #ifdef wxHAVE_WIN32_MB2WC
3000 wxMBConv_win32
*conv
= m_name
? new wxMBConv_win32(m_name
)
3001 : new wxMBConv_win32(m_encoding
);
3010 #endif // wxHAVE_WIN32_MB2WC
3014 // leave UTF16 and UTF32 to the built-ins of wx
3015 if ( m_name
|| ( m_encoding
< wxFONTENCODING_UTF16BE
||
3016 ( m_encoding
>= wxFONTENCODING_MACMIN
&& m_encoding
<= wxFONTENCODING_MACMAX
) ) )
3019 wxMBConv_cf
*conv
= m_name
? new wxMBConv_cf(m_name
)
3020 : new wxMBConv_cf(m_encoding
);
3022 wxMBConv_cf
*conv
= new wxMBConv_cf(m_encoding
);
3031 #endif // __DARWIN__
3034 wxFontEncoding enc
= m_encoding
;
3036 if ( enc
== wxFONTENCODING_SYSTEM
&& m_name
)
3038 // use "false" to suppress interactive dialogs -- we can be called from
3039 // anywhere and popping up a dialog from here is the last thing we want to
3041 enc
= wxFontMapperBase::Get()->CharsetToEncoding(m_name
, false);
3043 #endif // wxUSE_FONTMAP
3047 case wxFONTENCODING_UTF7
:
3048 return new wxMBConvUTF7
;
3050 case wxFONTENCODING_UTF8
:
3051 return new wxMBConvUTF8
;
3053 case wxFONTENCODING_UTF16BE
:
3054 return new wxMBConvUTF16BE
;
3056 case wxFONTENCODING_UTF16LE
:
3057 return new wxMBConvUTF16LE
;
3059 case wxFONTENCODING_UTF32BE
:
3060 return new wxMBConvUTF32BE
;
3062 case wxFONTENCODING_UTF32LE
:
3063 return new wxMBConvUTF32LE
;
3066 // nothing to do but put here to suppress gcc warnings
3073 wxMBConv_wxwin
*conv
= m_name
? new wxMBConv_wxwin(m_name
)
3074 : new wxMBConv_wxwin(m_encoding
);
3080 #endif // wxUSE_FONTMAP
3082 // NB: This is a hack to prevent deadlock. What could otherwise happen
3083 // in Unicode build: wxConvLocal creation ends up being here
3084 // because of some failure and logs the error. But wxLog will try to
3085 // attach a timestamp, for which it will need wxConvLocal (to convert
3086 // time to char* and then wchar_t*), but that fails, tries to log the
3087 // error, but wxLog has an (already locked) critical section that
3088 // guards the static buffer.
3089 static bool alreadyLoggingError
= false;
3090 if (!alreadyLoggingError
)
3092 alreadyLoggingError
= true;
3093 wxLogError(_("Cannot convert from the charset '%s'!"),
3097 (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding
).ToAscii()
3098 #else // !wxUSE_FONTMAP
3099 (const char*)wxString::Format(_("encoding %i"), m_encoding
).ToAscii()
3100 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3103 alreadyLoggingError
= false;
// Creates the underlying converter on demand: picks an encoding when neither
// a charset name nor an explicit encoding is set, then caches the DoCreate()
// result in m_convReal and clears m_deferred.
// NOTE(review): no braces or guard around the body are visible in this
// extract; presumably the work only runs while m_deferred is set -- confirm
// against the full file.
3109 void wxCSConv::CreateConvIfNeeded() const
// the method is const but has to update the cached members, hence the cast
3113 wxCSConv
*self
= (wxCSConv
*)this; // const_cast
3115 // if we have neither the name nor the encoding, use the default
3116 // encoding for this system
3117 if ( !m_name
&& m_encoding
== wxFONTENCODING_SYSTEM
)
// ask the locale for the system encoding; the "#endif // wxUSE_INTL" below
// suggests this assignment is the wxUSE_INTL branch -- TODO confirm
3120 self
->m_encoding
= wxLocale::GetSystemEncoding();
3122 // fallback to some reasonable default:
3123 self
->m_encoding
= wxFONTENCODING_ISO8859_1
;
3124 #endif // wxUSE_INTL
// build the real converter and record that creation is no longer pending
3127 self
->m_convReal
= DoCreate();
3128 self
->m_deferred
= false;
// Reports whether this converter can be used. Makes sure the real converter
// has been created first, then returns true either for ISO8859-1 (which needs
// no m_convReal) or whenever m_convReal is non-NULL.
3132 bool wxCSConv::IsOk() const
3134 CreateConvIfNeeded();
3136 // special case: no convReal created for wxFONTENCODING_ISO8859_1
3137 if ( m_encoding
== wxFONTENCODING_ISO8859_1
)
3138 return true; // always ok as we do it ourselves
3140 // m_convReal->IsOk() is called at its own creation, so we know it must
3141 // be ok if m_convReal is non-NULL
3142 return m_convReal
!= NULL
;
// Makes sure the real converter exists, then forwards all four arguments
// unchanged to m_convReal->ToWChar(); the second return hands them to the
// base class wxMBConv::ToWChar() instead.
// NOTE(review): the test choosing between the two returns is not part of
// this extract, presumably a NULL check of m_convReal -- confirm.
3145 size_t wxCSConv::ToWChar(wchar_t *dst
, size_t dstLen
,
3146 const char *src
, size_t srcLen
) const
3148 CreateConvIfNeeded();
// delegate to the real converter
3151 return m_convReal
->ToWChar(dst
, dstLen
, src
, srcLen
);
// fall back to the generic base class implementation
3154 return wxMBConv::ToWChar(dst
, dstLen
, src
, srcLen
);
// Counterpart of ToWChar() for the wide -> multibyte direction: makes sure
// the real converter exists, then forwards all four arguments unchanged to
// m_convReal->FromWChar(); the second return uses wxMBConv::FromWChar().
// NOTE(review): the test choosing between the two returns is not part of
// this extract, presumably a NULL check of m_convReal -- confirm.
3157 size_t wxCSConv::FromWChar(char *dst
, size_t dstLen
,
3158 const wchar_t *src
, size_t srcLen
) const
3160 CreateConvIfNeeded();
// delegate to the real converter
3163 return m_convReal
->FromWChar(dst
, dstLen
, src
, srcLen
);
// fall back to the generic base class implementation
3166 return wxMBConv::FromWChar(dst
, dstLen
, src
, srcLen
);
// Multibyte -> wide conversion of the NUL-terminated string psz into buf.
// The first return forwards to m_convReal->MB2WC(); the remaining lines
// convert directly by widening every byte of psz into one element of buf.
// NOTE(review): the guards around the two paths (presumably a NULL check of
// m_convReal and of buf) and the final return value are not part of this
// extract -- confirm against the full file.
3169 size_t wxCSConv::MB2WC(wchar_t *buf
, const char *psz
, size_t n
) const
3171 CreateConvIfNeeded();
// delegate to the real converter
3174 return m_convReal
->MB2WC(buf
, psz
, n
);
// direct conversion: length of the input without its terminating NUL
3177 size_t len
= strlen(psz
);
// "<=" so that the terminating NUL is copied as well
// NOTE(review): n is not consulted by the visible lines of this path, so
// len + 1 elements are written -- confirm buf is always large enough.
3181 for (size_t c
= 0; c
<= len
; c
++)
// go through unsigned char so that bytes >= 0x80 are not sign-extended
3182 buf
[c
] = (unsigned char)(psz
[c
]);
// Wide -> multibyte conversion of the NUL-terminated string psz into buf.
// The first return forwards to m_convReal->WC2MB(); the remaining lines
// convert directly by narrowing every wide character to a single char.
// NOTE(review): the conditions that trigger "return wxCONV_FAILED"
// (presumably a character that does not fit into one byte), the test
// selecting between the two loops (presumably whether buf is NULL) and the
// final return are not part of this extract -- confirm against the full file.
3188 size_t wxCSConv::WC2MB(char *buf
, const wchar_t *psz
, size_t n
) const
3190 CreateConvIfNeeded();
// delegate to the real converter
3193 return m_convReal
->WC2MB(buf
, psz
, n
);
// direct conversion: length of the input without its terminating NUL
3196 const size_t len
= wxWcslen(psz
);
// first loop: validates and stores each character, including the final NUL
// ("<=")
// NOTE(review): n is not consulted by the visible lines of this loop.
3199 for (size_t c
= 0; c
<= len
; c
++)
3202 return wxCONV_FAILED
;
3204 buf
[c
] = (char)psz
[c
];
// second loop: same walk over the input but nothing is stored, only the
// failure return is visible
3209 for (size_t c
= 0; c
<= len
; c
++)
3212 return wxCONV_FAILED
;
// Makes sure the real converter exists and returns its GetMBNulLen() result.
// NOTE(review): the guard before the delegation (presumably a NULL check of
// m_convReal) and the value returned for the ISO-8859-1 case mentioned below
// are not part of this extract -- confirm against the full file.
3219 size_t wxCSConv::GetMBNulLen() const
3221 CreateConvIfNeeded();
3225 return m_convReal
->GetMBNulLen();
3228 // otherwise, we are ISO-8859-1
// Only compiled when wxUSE_UNICODE_UTF8 is set: makes sure the real converter
// exists and returns its IsUTF8() result.
// NOTE(review): the guard before the delegation (presumably a NULL check of
// m_convReal) and the value returned for the ISO-8859-1 case mentioned below
// are not part of this extract -- confirm against the full file.
3232 #if wxUSE_UNICODE_UTF8
3233 bool wxCSConv::IsUTF8() const
3235 CreateConvIfNeeded();
3239 return m_convReal
->IsUTF8();
3242 // otherwise, we are ISO-8859-1
// Converts the multibyte string s to a wide character buffer, trying several
// converters in turn: wxConvLibc first, then a default-constructed
// wxMBConvUTF8, then wxConvISO8859_1.
// NOTE(review): the conditions between the steps are not part of this
// extract; presumably the empty buffer is returned for a NULL s and each
// later converter is only tried when the previous one produced nothing --
// confirm against the full file.
3250 wxWCharBuffer
wxSafeConvertMB2WX(const char *s
)
// early exit with an empty buffer
3253 return wxWCharBuffer();
// first attempt: the C library converter
3255 wxWCharBuffer
wbuf(wxConvLibc
.cMB2WX(s
));
// second attempt: UTF-8
3257 wbuf
= wxMBConvUTF8().cMB2WX(s
);
// last attempt: ISO8859-1
3259 wbuf
= wxConvISO8859_1
.cMB2WX(s
);
// Converts the wide string ws to a multibyte buffer, trying wxConvLibc first
// and then a wxMBConvUTF8 constructed with MAP_INVALID_UTF8_TO_OCTAL.
// NOTE(review): the conditions between the steps are not part of this
// extract; presumably the empty buffer is returned for a NULL ws and the
// UTF-8 converter is only tried when wxConvLibc produced nothing -- confirm
// against the full file.
3264 wxCharBuffer
wxSafeConvertWX2MB(const wchar_t *ws
)
// early exit with an empty buffer
3267 return wxCharBuffer();
// first attempt: the C library converter
3269 wxCharBuffer
buf(wxConvLibc
.cWX2MB(ws
));
// second attempt: UTF-8 in MAP_INVALID_UTF8_TO_OCTAL mode
3271 buf
= wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL
).cWX2MB(ws
);
3276 #endif // wxUSE_UNICODE
3278 // ----------------------------------------------------------------------------
3280 // ----------------------------------------------------------------------------
3282 // NB: The reason why we create converter objects in this convoluted way,
3283 // using a factory function instead of global variable, is that they
3284 // may be used at static initialization time (some of them are used by
3285 // wxString ctors and there may be a global wxString object). In other
3286 // words, possibly _before_ the converter global object would be
3293 #undef wxConvISO8859_1
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) expands to:
//  - an exported pointer variable name##Ptr of type klass*, initially NULL;
//  - an exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object name##Obj of type impl_klass, constructed
//    with ctor_args;
//  - a file-static pointer gs_##name##instance initialized by calling the
//    accessor (see the comment inside the macro for the reason).
// The macro ends without a semicolon, so each use must supply one.
3295 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) \
3296 WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL; \
3297 WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr() \
3299 static impl_klass name##Obj ctor_args; \
3300 return &name##Obj; \
3302 /* this ensures that all global converter objects are created */ \
3303 /* by the time static initialization is done, i.e. before any */ \
3304 /* thread is launched: */ \
3305 static klass* gs_##name##instance = wxGet_##name##Ptr()
// Shorthand for the common case where the implementation class is the same
// as the class the accessor returns.
3307 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3308 WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
// wxConvLibc implemented by wxMBConv_win32.
// NOTE(review): wxConvLibc is defined twice in a row here; presumably a
// preprocessor conditional that is not part of this extract selects exactly
// one of the two definitions -- confirm against the full file.
3311 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConv_win32
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
// wxConvLibc implemented by wxMBConvLibc
3313 WX_DEFINE_GLOBAL_CONV2(wxMBConv
, wxMBConvLibc
, wxConvLibc
, wxEMPTY_PARAMETER_VALUE
);
3316 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3317 // passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3318 // provokes an error message about "not enough macro parameters"; and we
3319 // can't use "()" here as the name##Obj declaration would be parsed as a
3320 // function declaration then, so use a semicolon and live with an extra
3321 // empty statement (and hope that no compiler warns about this)
3322 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8
, wxConvUTF8
, ;);
3323 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7
, wxConvUTF7
, ;);
// wxCSConv based converters: one constructed with wxFONTENCODING_SYSTEM and
// one fixed to ISO8859-1
3325 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvLocal
, (wxFONTENCODING_SYSTEM
));
3326 WX_DEFINE_GLOBAL_CONV(wxCSConv
, wxConvISO8859_1
, (wxFONTENCODING_ISO8859_1
));
// exported converter pointers, initialized through the accessor functions
// generated by the macros above
3328 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvCurrent
= wxGet_wxConvLibcPtr();
3329 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvUI
= wxGet_wxConvLocalPtr();
3332 // The xnu kernel always communicates file paths in decomposed UTF-8.
3333 // WARNING: Are we sure that CFString's conversion will cause decomposition?
// file-local CoreFoundation based converter constructed for UTF-8
3334 static wxMBConv_cf
wxConvMacUTF8DObj(wxFONTENCODING_UTF8
);
// Converter used for file names: the non-Darwin branch below initializes it
// with the wxConvLibc accessor.
// NOTE(review): the Darwin initializer is not part of this extract;
// presumably it is the address of wxConvMacUTF8DObj -- confirm.
3337 WXDLLIMPEXP_DATA_BASE(wxMBConv
*) wxConvFileName
=
3340 #else // !__DARWIN__
3341 wxGet_wxConvLibcPtr();
3342 #endif // __DARWIN__/!__DARWIN__
3344 #else // !wxUSE_WCHAR_T
3346 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3347 // stand-ins in absence of wchar_t
// NOTE(review): this declaration ends with a comma, so the list of further
// stand-in objects continues on lines that are not part of this extract.
3348 WXDLLIMPEXP_DATA_BASE(wxMBConv
) wxConvLibc
,
3353 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T